{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# Word Frequency in ROCO\n",
    "应该统计 word frequency 比较好, 我当时考虑输入到 model 的是 token 而不是 word. 但是确实没有必要做 token 的统计"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 23,
   "metadata": {},
   "outputs": [],
   "source": [
    "import pandas as pd\n",
    "import numpy as np\n",
    "from pathlib import Path\n",
    "from tqdm import tqdm\n",
    "import json"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 24,
   "metadata": {},
   "outputs": [],
   "source": [
    "test_filepath = \"/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/test_keywords.csv\"\n",
    "valid_filepath = \"/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/valid_keywords.csv\"\n",
    "train_filepath = \"/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/train_keywords.csv\"\n",
    "\n",
    "df_test = pd.read_csv(test_filepath, sep=',')\n",
    "df_valid = pd.read_csv(valid_filepath, sep=',')\n",
    "df_train = pd.read_csv(train_filepath, sep=',')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 25,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<div>\n",
       "<style scoped>\n",
       "    .dataframe tbody tr th:only-of-type {\n",
       "        vertical-align: middle;\n",
       "    }\n",
       "\n",
       "    .dataframe tbody tr th {\n",
       "        vertical-align: top;\n",
       "    }\n",
       "\n",
       "    .dataframe thead th {\n",
       "        text-align: right;\n",
       "    }\n",
       "</style>\n",
       "<table border=\"1\" class=\"dataframe\">\n",
       "  <thead>\n",
       "    <tr style=\"text-align: right;\">\n",
       "      <th></th>\n",
       "      <th>word</th>\n",
       "      <th>count</th>\n",
       "    </tr>\n",
       "  </thead>\n",
       "  <tbody>\n",
       "    <tr>\n",
       "      <th>0</th>\n",
       "      <td>arrow</td>\n",
       "      <td>1513</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>1</th>\n",
       "      <td>right</td>\n",
       "      <td>1296</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>2</th>\n",
       "      <td>ct</td>\n",
       "      <td>1282</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>3</th>\n",
       "      <td>image</td>\n",
       "      <td>1185</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>4</th>\n",
       "      <td>left</td>\n",
       "      <td>1016</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>...</th>\n",
       "      <td>...</td>\n",
       "      <td>...</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10652</th>\n",
       "      <td>zstack</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10653</th>\n",
       "      <td>zubal</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10654</th>\n",
       "      <td>zy</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10655</th>\n",
       "      <td>zygoma</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "    <tr>\n",
       "      <th>10656</th>\n",
       "      <td>zygomaticomaxillary</td>\n",
       "      <td>1</td>\n",
       "    </tr>\n",
       "  </tbody>\n",
       "</table>\n",
       "<p>10657 rows × 2 columns</p>\n",
       "</div>"
      ],
      "text/plain": [
       "                      word  count\n",
       "0                    arrow   1513\n",
       "1                    right   1296\n",
       "2                       ct   1282\n",
       "3                    image   1185\n",
       "4                     left   1016\n",
       "...                    ...    ...\n",
       "10652               zstack      1\n",
       "10653                zubal      1\n",
       "10654                   zy      1\n",
       "10655               zygoma      1\n",
       "10656  zygomaticomaxillary      1\n",
       "\n",
       "[10657 rows x 2 columns]"
      ]
     },
     "execution_count": 25,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "df_test"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## Histgram of Word Frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 26,
   "metadata": {},
   "outputs": [],
   "source": [
    "import matplotlib.pyplot as plt"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 28,
   "metadata": {},
   "outputs": [],
   "source": [
    "def normalize(word_count):\n",
    "    max_count, min_count = max(word_count), min(word_count)\n",
    "    gap = max_count - min_count\n",
    "    word_frequency = [(count - min_count) / gap for count in word_count]\n",
    "    return word_frequency"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 56,
   "metadata": {},
   "outputs": [],
   "source": [
    "def visualize(df, dataset='TestSet'):\n",
    "    num_caption = {\n",
    "        'TestSet': 10657,\n",
    "        'ValidSet': 10285,\n",
    "        'TrainSet': 32914\n",
    "    }[dataset]\n",
    "    wordcount = df['count'].tolist()\n",
    "    print(f\"{len(wordcount)} different words in {dataset}\")\n",
    "    total_cnt = sum(wordcount)\n",
    "    frequency = [cnt / num_caption for cnt in wordcount]\n",
    "    # normal_frequency = normalize(wordcount)\n",
    "\n",
    "    df['frequency'] = frequency\n",
    "\n",
    "    plt.hist(frequency, bins=200, color='black', alpha=0.7)\n",
    "    plt.xlabel('Word Frequency')\n",
    "    plt.ylabel('Num')\n",
    "    plt.title(f'Word Frequency in {dataset} per case')\n",
    "    plt.show()\n",
    "\n",
    "    # plt.hist([fre for fre in frequency if fre < 0.001], bins=200, color='black', alpha=0.7)\n",
    "    # plt.xlabel('Word Frequency')\n",
    "    # plt.ylabel('Num')\n",
    "    # plt.title(f'Word Frequency in {dataset}')\n",
    "    # plt.show()\n",
    "    return df"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10657 different words in TestSet\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAb9UlEQVR4nO3df5QlZX3n8feHAQERFggDAgMOClEBjeKI+CMJEYxExTFuiGSNImJYPcZo1DWo2ZWcLIk5idGYKFlCFFCRIBoFEqIEAwgiMCCK/JJRCDPOAEOQH5IIgt/9o56Roqe7q2e6b0/PzPt1zj1d96nnqfren59bVfdWp6qQJGkym63vAiRJc59hIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYaJ0kOT7Jp9d3HXNZkvclOXl91yHNBMNiI5HkvUn+eUzbzRO0HTniWg5O8tMkP+pdzhnlOueiqvqTqnrT2o5Lcl7vfvtJkod61/92HZa3RrAn2S/JV5L8MMk9Sa5K8rIpLu/WJIeubR3asG2+vgvQjLkYOC7JvKp6JMkTgS2AA8a07d36TlmSzavq4bWsZ0VVLRjBcjd6VfVrq6eTnAIsr6o/nOHVnAOcCLyiXX8ukBlex7T5HJk73LLYeFxJFw7Patd/Cfg34KYxbd+rqhVJdktydpK7kyxN8jurF9Q+iZ6V5NNJ7gPekGSvJBcluT/J+cBOa1tgkjckuTTJh5PcDRyfZMskf5HktiR3JPnbJFv3xvyvJCuTrEjyxiSVZO8278Ikbxqz/Et615+W5Px2G29K8pu9eack+ViSf2q36fIkT+nN36839o62S+mJSf4zyc/1+j0nyaokW4xze3/2iT7Jwlb7Ue223pXk/etwH74iyTVta+DrSZ7Zm/cHSX7Qbs9NSQ5JchjwPuA1bcvkW0l2AvYC/q6qHmqXS6vqkqH1JPkUsCdwTlvee8ap8eAky9t9dlfbEnltb/6Ej3lv7B8kuR345AT3w+8kuaHd1uuTHNDaj0vyvV77r/fG7N2ew/e2uv6hN2/C54qaqvKykVzowuH32/TfAG8EThjT9ok2fRHwcWArujBZBRzS5h0P/AR4Fd0Hiq2By4C/BLakC537gU9PUMfBdJ+Gx7a/AXgYeBvdVu3WwEeAs4EdgW3pPvH+aet/GHAHsD+wDXA6UMDebf6FwJvGLP+SNr0NsAw4uq3rAOAuYL82/xTgbuDANv8zwBlt3rbASuBd7f7ZFnhem/fPwFt66/ww8NcT3A/Hr76PgIWt9r9rt/sXgAeBpw88pqcA/7dNHwDcCTwPmAccBdzaHpOnttu7W299TxlbR7se4Gbg3PYY7zJmnROup82/FTh0kpoPbo/z6ufLLwMPAE9t8yd7zFeP/bM2dutxln8E8AMe3RraG3hSb95udM/b17T17trmfRZ4f5u3FfCiqTxXvLT7fX0X4GUGH8zuTeEf2/S3gH3o3nD7bUcBewCPANv2xv4pcEpvORf35u3ZXsDb9NpOZ/Kw+ClwT+/ym3Rv5rf1+qW9mJ/Sa3s+cEub/gTwwd68n2fqYfEa4Gtj6vp/wAfa9CnAyb15LwNubNO/BXxzgtv2GuDSNj0PuB04cJLHY2xYLOjNvwI4cuAxPYVHw+JE4I/HzL+J7s14b7o3+EOBLSaqo9e2gO7Dw/faY3UxsM/Qetr0rUwtLPrPlzOB/z2Fx/xg4CFgq0mW/2Xg7VN8TVwDLG7TpwEn9R+DqTxXvHQXd0NtXC4GXpRkB2B+Vd0MfB14QWvbv/XZDbi7qu7vjf13YPfe9WW96d2AH1bVA2P6T2ZFVW3fu5w5znLnA48Hrmq7O+4B/qW1r15vv//QOvueBDxv9XLbsl8LPLHX5/be9H8CT2jTe9C9iY7nS8C+SZ4MvAS4t6quWIu6JlrnVDwJeNeY27QH3dbEUuAddMFwZ5Izkuw20YKqanlV/W5VPaUt9wG6N9NJ17MWtY73fNmN4cccYFVV/XiSZU/4+CR5fW/32T10z/nVu0zfQxdWVyS5LskbW/tUniubPA9wb1wuA/4bcCxwKUBV3ZdkRWtbUVW3JHkY2DHJtr3A2JNu0361/umIVwI7JNmm9waw55g+U9UfcxfwX3Sb+z8Yp+9KujeG1fYcM/8Bujee1fov7mXARVX1knWocRnd1sUaqurHSc6kezN5GvCpdVj+uloGnFBVJ0xQ2+nA6Um2o/tk/GfA6xh4nKpqWZKP0e2mGVzP0PKa8Z4v32H4MZ/K8pcBTxnbmORJdLv5DgEuq+5LHdfQDtxX1e3A77S+LwL+NcnFTO+5sslwy2IjUlX/BSwB3gl8rTfrktZ2ceu3jG6L40+TbNUOXh5Dt99+vOX+e1vuHyV5XHuhHT4D9f6U7sX94SQ7AyTZPclLW5cz6Q6u75vk8cAHxiziGuDVSR6f7qD3Mb155wI/n+R1SbZol+cmefoUSjsXeGKSd7SDsdsmeV5v/ml0u7xeCczmb03+Dnhzkuels02Sl7f6nprkxUm2BH5M94b8SBt3B7AwyWYASXZI8kftgO9m7YD3G4FvDK2nt7wnT6He1c+XX6T71tXnpvCYT8XJwLvTfbkg7XY8ie7YQ9EdfyPJ0XRbFrTrRyRZ/Q29H7a+jzC958omw7DY+FwE7EwXEKt9rbX1vzL7W3T70VcA/0i3f/b8SZb7P+gOeN5N96Z92iR918YfAEuBb6T75tW/0h2sparOozsY+tXW56tjxn6Ybv/2HcCp9MKubTH9KnAk3W28nUcPmk6qjX0JXSDeTncw+Fd68y+l289/dVXdunY3d91V1RK6T8Z/Q/dmt5QutKC7XR+k++R+O93j/b4273Pt738kuZruPltId1/fR/eJ/8HVyxpYD3THt/6w7bJ59wTl3t7GrqB7XN5cVTe2eRM+5lO8Hz5H98WN0+m+aPFFYMequh74EN0W9h3AM2hb2M1zgcuT/IjuAPvbq+qW6TxXNiVpB3OkDUKSojsQu3Q91/FV4PSq8hfaYyQ5mO6A+qS/s9GGxWMW0lpK8ly6r1cuXt+1SLPF3VDSWkhyKt1uk3eM+TaZtFFzN5QkaZBbFpKkQRvtMYuddtqpFi5cuL7LkKQNylVXXXVXVc0f277RhsXChQtZsmTJ+i5DkjYoScY9U4K7oSRJgwwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDDItxHH74tP8JnCRtVAwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDDAtJ0iDDQpI0yLCQJA0yLCRJgwwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDDAtJ0iDDQpI0yLCQJA0aaVgk+f0k1yX5TpLPJtkqyY5Jzk9yc/u7Q6//e5MsTXJTkpf22p+T5No276NJMsq6JUmPNbKwSLI78HvAoqraH5gHHAkcB1xQVfsAF7TrJNm3zd8POAz4eJJ5bXEnAscC+7TLYaOqW5K0plHvhtoc2DrJ5sDjgRXAYuDUNv9U4FVtejFwRlU9WFW3AEuBA5PsCmxXVZdVVQGn9cZIkmbByMKiqn4A/AVwG7ASuLeqvgLsUlUrW5+VwM5tyO7Ast4ilre23dv02PY1JDk2yZIkS1atWjWTN0eSNmmj3A21A93Wwl7AbsA2SX57siHjtNUk7Ws2Vp1UVYuqatH8+fPXtmRJ0gRGuRvqUOCWqlpVVT8BvgC8ALij7Vqi/b2z9V8O7NEbv4But9XyNj22XZI0S0YZFrcBByV5fPv20iHADcDZwFGtz1HAl9r02cCRSbZMshfdgewr2q6q+5Mc1Jbz+t4YSdIs2HxUC66qy5OcBVwNPAx8EzgJeAJwZpJj6ALliNb/uiRnAte3/m+tqkfa4t4CnAJsDZzXLpKkWTKysACoqg8AHxjT/CDdVsZ4/U8AThinfQmw/4wXKEmaEn/BLUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRpkWEiSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRpkWEiSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRpkWEiSBhkWkqRBhoUkadBIwyLJ9knOSnJjkhuSPD/JjknOT3Jz+7tDr/97kyxNclOSl/ban5Pk2jbvo0kyyrolSY816i2LvwL+paqeBvwCcANwHHBBVe0DXNCuk2Rf4EhgP+Aw4ONJ5rXlnAgcC+zTLoeNuG5JUs/IwiLJdsAvAX8PUFUPVdU9wGLg1NbtVOBVbXoxcEZVPVhVtwBLgQOT7ApsV1WXVVUBp/XGSJJmwSi3LJ4MrAI+meSbSU5Osg2wS1WtBGh/d279dweW9cYvb227t+mx7WtIcmySJUmWrFq1amZvjSRtwkYZFpsDBwAnVtWzgQdou5wmMN5xiJqkfc3GqpOqalFVLZo/f/7a1itJmsAow2I5sLyqLm/Xz6ILjzvariXa3zt7/ffojV8ArGjtC8ZplyTNkpGFRVXdDixL8tTWdAhwPXA2cFRrOwr4Ups+GzgyyZZJ9qI7kH1F21V1f5KD2regXt8bI0maBZuPePlvAz6T5HHA94Gj6QLqzCTHALcBRwBU1XVJzqQLlIeBt1bVI205bwFOAbYGzmsXSdIsGWlYVNU1wKJxZh0yQf8TgBPGaV8C7D+jxUmSpsxfcEuSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRo0pRMJtlOGvw1Y2B9TVa8cTVmSpLlkqmed/SLd/9I+B/jpyKqRJM1JUw2LH1fVR0daiSRpzppqWPxVkg8AXwEeXN1YVVePpCpJ0pwy1bB4BvA64MU8uhuq2nVJ0kZuqmHx68CTq+qhURYjSZqbpvrV2W8B24+wDknSHDbVLYtdgBuTXMljj1n41VlJ2gRMNSw+MNIqJElz2pTCoqouGnUhkqS5a6q/4L6f7ttPAI8DtgAeqKrtRlWYJGnumOqWxbb960leBRw4ioIkSXPPOp1IsKq+iL+xkKRNxlR3Q726d3UzYBGP7paSJG3kpvptqMN70w8DtwKLZ7waSdKcNNVjFkePuhBJ0tw1aVgk+T+TzK6q+uMZrkeSNAcNbVk8ME7bNsAxwM8BhoUkbQImDYuq+tDq6STbAm8HjgbOAD400ThJ0sZl8JhFkh2BdwKvBU4FDqiqH466MEnS3DF0zOLPgVcDJwHPqKofzUpVkqQ5ZehHee8CdgP+EFiR5L52uT/JfaMvT5I0Fwwds1inX3hLkjYuhoEkadDIwyLJvCTfTHJuu75jkvOT3Nz+7tDr+94kS5PclOSlvfbnJLm2zftokoy6bknSo2Zjy+LtwA2968cBF1TVPsAF7TpJ9gWOBPYDDgM+nmReG3MicCywT7scNgt1S5KakYZFkgXAy4GTe82L6b6CS/v7ql77GVX1YFXdAiwFDkyyK7BdVV1WVQWc1hsjSZoFo96y+AjwHuCnvbZdqmolQPu7c2vfHVjW67e8te3epse2ryHJsUmWJFmyatWqGbkBkqQRhkWSVwB3VtVVUx0yTltN0r5mY9VJVbWoqhbNnz9/iquVJA2Z6inK18ULgVcmeRmwFbBdkk8DdyTZtapWtl1Md7b+y4E9euMXACta+4Jx2iVJs2RkWxZV9d6qWlBVC+kOXH+1qn4bOBs4qnU7CvhSmz4bODLJlkn2ojuQfUXbVXV/koPat6Be3xsjSZoFo9yymMgHgTOTHAPcBhwBUFXXJTkTuJ7uHyy9taoeaWPeApwCbA2c1y6SpFkyK2FRVRcCF7bp/wAOmaDfCcAJ47QvAfYfXYWSpMn4C25J0iDDQpI0yLCQJA0yLCRJgwwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDDAtJ0iDDQpI0yLCQJA0yLCRJgwwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDDAtJ0iDDQpI0yLCQJA0yLCRJgwwLSdIgw0KSNMiwkCQNMiwkSYMMC0nSIMNCkjTIsJAkDTIsJEmDRhYWSfZI8m9JbkhyXZK3t/Ydk5yf5Ob2d4femPcmWZrkpiQv7bU/J8m1bd5Hk2RUdUuS1jTKLYuHgXdV1dOBg4C3JtkXOA64oKr2AS5o12nzjgT2Aw4DPp5kXlvWicCxwD7tctgI65YkjTGysKiqlVV1dZu+H7gB2B1YDJzaup0KvKpNLwbOqKoHq+oWYClwYJJdge2q6rKqKuC03hhJ0iyYlWMWSRYCzwYuB3apqpXQBQqwc+u2O7CsN2x5a9u9TY9tH289xyZZkmTJqlWrZvQ2SNKmbORhkeQJwOeBd1TVfZN1HaetJmlfs7HqpKpaVFWL5s+fv/bFSpLGNdKwSLIFXVB8pqq+0JrvaLuWaH/vbO3LgT16wxcAK1r7gnHaJUmzZJTfhgrw98ANVfWXvVlnA0e16aOAL/Xaj0yyZZK96A5kX9F2Vd2f5KC2zNf3xkiSZsHmI1z2C4HXAdcmuaa1vQ/4IHBmkmOA24AjAKrquiRnAtfTfZPqrVX1SBv3FuAUYGvgvHaRJM2SkYVFVV3C+McbAA6ZYMwJwAnjtC8B9p+56iRJa8NfcEuSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRpkWEiSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQYaFJGmQYSFJGmRYSJIGGRaSpEGGhSRpkGEhSRpkWEiSBhkWkqRBhsUEDj/8cA4//PD1XYYkzQmGhSRpkGEhSRpkWEiSBhkWkqRBhoUkaZBhIUkaZFhIkgYZFgP8rYUkGRaSpCnYYMIiyWFJbkqyNMlxs7luf80taVO3+fouYCqSzAM+BrwEWA5cmeTsqrp+NuvoB8Y555wzm6uWpPVqgwgL4EBgaVV9HyDJGcBiYFbDom9ttjTOOeecn/U3ZCRtiDaUsNgdWNa7vhx43thOSY4Fjm1Xf5TkpnVc307AXes4dg1Jxp2eITNa64htKLVuKHWCtY7Kplzrk8Zr3FDCYrx32Fqjoeok4KRpryxZUlWLpruc2WCtM29DqROsdVSsdU0bygHu5cAevesLgBXrqRZJ2uRsKGFxJbBPkr2SPA44Ejh7PdckSZuMDWI3VFU9nOR3gS8D84BPVNV1I1zltHdlzSJrnXkbSp1graNirWOkao1d/5IkPcaGshtKkrQeGRaSpEGbVFgMnTIknY+2+d9OcsBUx86VWpPskeTfktyQ5Lokb5+rtfbmz0vyzSTnzuVak2yf5KwkN7b79/lzuNbfb4//d5J8NslW67nWpyW5LMmDSd69NmPnSq2z/dqazn3a5s/s66qqNokL3YHx7wFPBh4HfAvYd0yflwHn0f2u4yDg8qmOnUO17goc0Ka3Bb47V2vtzX8ncDpw7lx9DrR5pwJvatOPA7afi7XS/Yj1FmDrdv1M4A3rudadgecCJwDvXpuxc6jWWXttTafO3vwZfV1tSlsWPztlSFU9BKw+ZUjfYuC06nwD2D7JrlMcOydqraqVVXU1QFXdD9xA9+Yx52oFSLIAeDlw8ghrnHatSbYDfgn4e4Cqeqiq7pmLtbZ5mwNbJ9kceDyj/V3SYK1VdWdVXQn8ZG3HzpVaZ/m1NZ37dCSvq00pLMY7ZcjYB3qiPlMZO5OmU+vPJFkIPBu4fOZLnHodA30+ArwH+OmI6ptqHUN9ngysAj7ZNu1PTrLNXKy1qn4A/AVwG7ASuLeqvrKeax3F2HUxI+ubhdfWdOv8CDP8utqUwmIqpwyZqM+UTjcyg6ZTazczeQLweeAdVXXfDNY21jrXmuQVwJ1VddXMlzWu6dyvmwMHACdW1bOBB4BR7l+fzv26A92n0L2A3YBtkvz2DNc3WMcsjF0X017fLL221rnOUb2uNqWwmMopQybqM9unG5lOrSTZgu7J/Jmq+sII65y0jin0eSHwyiS30m1mvzjJp0dX6rSfA8uravUnybPowmNUplProcAtVbWqqn4CfAF4wXqudRRj18W01jeLr63p1Dma19UoDs7MxQvdJ8Pv033aWn3AaL8xfV7OYw8YXjHVsXOo1gCnAR+Z6/frmD4HM/oD3NOqFfga8NQ2fTzw53OxVrozMl9Hd6widAfm37Y+a+31PZ7HHjSec6+tSWqdtdfWdOocM2/GXlcjvcFz7UL37ZHv0n3L4P2t7c3Am3tPho+1+dcCiyYbOxdrBV5Et7n6beCadnnZXKx1zDJm7Ek9wufAs4Al7b79IrDDHK71j4Abge8AnwK2XM+1PpHu0/J9wD1teruJxs7FWmf7tTWd+7S3jBl7XXm6D0nSoE3pmIUkaR0ZFpKkQYaFJGmQYSFJGmRYSJIGGRba6CT5cJJ39K5/OcnJvesfSvLOdVz2weOdxbO135vkmnb513UqXpqjDAttjL5O+8Vyks2AnYD9evNfAFw6lQUlmbcW6/1aVT2rXQ4ds5wN4l8YSxMxLLQxupRHT2+xH90P0+5PskOSLYGnA99Mckg7KeC1ST7R5pHk1iT/J8klwBHt/wrc2K6/eqpFJHlDks8lOQf4SpJt2nqubOtd3PptneSM9j8p/iHJ5UkWtXk/6i3vN5Kc0qbnJ/l8W9aVSV7Y2o9v67gwyfeT/F5v/OvbOr6V5FNJtk1ySzuFBUm2a7d9i3W727Ux89OONjpVtSLJw0n2pAuNy+jO2Pl84F66X+BuBpwCHFJV301yGvAWurN1Avy4ql6U7p8G3Qy8GFgK/MMkq/7FJNe06c8BP2jrfGZV3Z3kT4CvVtUbk2wPXNF2V/1P4D+r6plJnglcPYWb+VfAh6vqknY7v0wXggBPA36F7n8u3JTkRODngfcDL6yqu5LsWFX3J7mQ7rQhXwSOBD5f3fmkpMdwy0Ibq9VbF6vD4rLe9a8DT6U72d53W/9T6f5fxWqrQ+Fprd/N1Z3uYLITsvV3Q53Q2s6vqrvb9K8Cx7VAuRDYCtizrffTAFX1bbowG3Io8DdtWWcD2yXZts37p6p6sKruAu4EdqELu7NaG72aTgaObtNHA5+cwrq1CXLLQhur1cctnkG3G2oZ8C668+h8gvFPAd33QG96OufE6S8nwH+vqpv6HZJMto5+e/9fo24GPL+q/mucZT3Ya3qE7nWe8dZRVZcmWZjkl4F5VfWdSW+NNlluWWhjdSnwCuDuqnqkfZLenm630GV0J9lbmGTv1v91wEXjLOdGYK8kT2nXf2saNX0ZeFvaO3qSZ7f2i4HXtrb9gWf2xtyR5OntQP2v99q/Avzu6itJnjWw7guA30zyc63/jr15pwGfxa0KTcKw0MbqWrpvQX1jTNu9VXVXVf2YbrfL55JcS/cfxf527EJav2OBf2oHuP99GjX9MbAF8O0k32nXAU4EnpDk23T/3eyK3pjjgHOBr9L917vVfg9Y1A5YX093NtIJVdV1dP+r+aIk3wL+sjf7M8AOdIEhjcuzzkpzTDvo/O6qWjJL6/sNYHFVvW421qcNk8cspE1Ykr8Gfo3ufydIE3LLQpI0yGMWkqRBhoUkaZBhIUkaZFhIkgYZFpKkQf8fDRsRiOIJMsYAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "10285 different words in ValidSet\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAYsAAAEWCAYAAACXGLsWAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAcxUlEQVR4nO3de7iVdZ338fdH8IAKAyYeOBiYqIk5HshjT+OEJaMRTpOFTymaDZPjZGbOjFpX2tVD01yTaT6pjWMq5BG1Eu2xNMxjKOIRUVESEwIV80SkmPR9/vj9tt4s9l6/tWGvzdrsz+u61rXv+3efvutea+3Pun/3WvdSRGBmZlbPRuu7ADMza30OCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhTVM0lmSLl/fdbQySWdIung9bPd2SV/Iw5+VdEsj85o1ymHRg0k6XdL/q2l7uoO2iU2u5WBJf5H0x8rtxmZusxVFxLcjotP/iPNjeWc77VtLekvS7p2o4YqI+FiD291E0tmSFufHbKGkcxpc1m8eehGHRc92J3CQpD4AkrYDNgb2rmnbKc/bMEl916KeJRGxZeU2vovW2xv8GDhQ0sia9onA3Ih4rEnbPR0YA+wL9Af+FnioSdtaa37erH8Oi57tflI47JnHPwz8Gphf0/bbiFgiaYikGZJelrRA0j+2rSi/S7xO0uWSXgeOlTRS0h2Slku6Fdi6swVKOlbSPZLOkfQycJakTSV9V9Jzkl6Q9ENJ/SrL/KukpZKWSPq8pJC0U562WhdKXv/dlfFdJd2a7+N8SZ+uTLtM0vmSfp7v032S3leZPrqy7Au5S2k7SX+S9J7KfPtIWiZp43bu7zvvtiWNyLVPyvf1JUlfa28/RcRi4Dbg6JpJxwBTJQ2SdFPe7it5eFidfV7dJx+V9KSk1yT9AFBl9g8CP42IJZE8GxHTKssOkXR93u5CSSfl9nHAGcBn8hHJIx3U8mw+ano8132ppM0q0z8u6WFJr0r6jaQ9apb9d0mPAivaC4z2HrPcvq+kWXm9SyX9QNImeZry8/HFvE8eVT5yKz03ezOHRQ8WEW8B95ECgfz3LuDumra2o4qrgMXAEOBTwLclja2scgJwHTAQuAK4EniAFBLfAiatZan7Ac8A2wBTgP8EdiYF2k7AUOAb8M4/oVOBjwKjgEMa3YikLYBbc93bAEcBF0gaXZntKOCbwCBgQa4HSf2BXwG/IO2fnYCZEfE8cDvw6co6PgdcHRF/brC0DwG7AGOBb0h6fwfzTaUSFpJ2Ie2jq0iv1UuB9wI7AG8APyhtWNLWwPXA10mP42+Bgyqz3AucIumfJX1AkirLbgTcCDxCeozGAidLOjQifgF8G7gmH0X+dZ0yPgscCryP9Lh/Pa9/b+AS4J+A9wD/DcyQtGll2aOAw4GBEfF2zX1r9zHLk1cBX8n3+YBc+z/naR8jvS52Jj3XPwP8IU/r8LnZ60WEbz34BpxFemcI6UU9ChhX0zYJGE56AfWvLPsfwGWV9dxZmbYD8DawRaXtSuDyDuo4GPgL8Grl9mngWOC5ynwCVgDvq7QdACzMw5cA36lM2xkIYKc8fjvwhcr0Y4G78/BngLtq6vpv4Mw8fBlwcWXaYcCTefgo4KEO7ttngHvycB/geWDfOo/H5Xl4RK59WGX6bGBiB8tuDrwOHJjHpwA3dDDvnsArlfF39kvNPjkGuLdm/y+uzNsHOBG4B1gJLAEm5Wn7VR+73HY6cGntfa3z/HwW+GLNPv9tHr4Q+FbN/POBv6ks+/k66+7wMWtn3pN59zXxEeApYH9go0afm7395n7Anu9O4ERJg4DBEfG0pBfIXRfA7nmeIcDLEbG8suzvSP3VbRZVhoeQ/hmtqJl/eJ1alkTEal0jko6tWe9g0j/FB6pvYkn/tNq2+0DNNhv1XmA/Sa9W2vqSzge0eb4y/Cdgyzw8nPSuuz03AD+UtCMpvF6LiNmdqKujba4mIv4k6VrgGEmzSO/ITwGQtDlwDumNwKC8SH9JfSJiVZ1tD6Gy/yMiJFXHVwHnA+fn7pbPA5dImk3an0Nq9mcf0tFrZ1Qf/9/lmsjrnyTpS5Xpm1Sm1y5bq8PHTNLOwPdIz+/NSc+DBwAi4rbcHXc+sIOkn5KOZjej/nOzV3M3VM83C/grYDLp3SER8TrpHeJk0j/whXl8q3zo3mYH4PeV8eoliJcCg3LXTnX+tVFd70ukLpTRETEw3/4qItr+gS5l9UCq3eYK0gu6zXaV4UXAHZX1DozURXJCAzUuInWTrFl8xJvAdNI/76NZPXy62lTSEdlHSSecb8rtXyV1Ze0XEQN4t5tRa6xhdavtz9zN1G7gR8QbEXE+8AqwG2mfLKzZn/0j4rC2RRq8T7WP55I8vAiYUrP+zSPiqmpZddbb4WNGOmp5EhiV99cZVPZVRJwXEfsAo0lvAP6V8nOzV3NY9HAR8QYwh/QOtPqO7+7cdmeebxHwG+A/JG2WTyQeTzo30d56f5fX+02lj1d+CFjj001rUe9fgP8BzpG0DYCkoZIOzbNMJ51c3y2/mz6zZhUPA5+UtLnSSe/jK9NuAnaWdLSkjfPtg3XOEVTdBGwn6eR8krO/pP0q06eRunc+ATTz46J3kbrwLiKdF3krt/cn/SN7VdJWrLlfOvJzYLSkT+YTxCdRCdh8fw+W1E9SX0mT8rYeInWZvZ5PMveT1EfS7pI+mBd/ARiRz23Uc6KkYbnuM4Brcvv/AF+UtF8+6byFpMNr3tDUU+8x60/q0vujpF2Bd94w5OfEfkofUFgBvAmsauC52as5LDYMd5BO6N5dabsrt1U/MnsUqR99CfBTUl/+rXXW+79J/dYvk/45Taszb2f8O+nk8r1Kn7z6FeldMxFxM3Au6ZNBC/LfqnOAt0j/qKZSCbvcxfYx0sdNl5C6f/4T2JSCvOxHSYH4PPA06WOkbdPvIZ2TeTAinu3c3W1cpI7yaaQumur+PhfoR3r3ey/ppG4j63sJOBL4Dukk7ijyEWj2BnA26T6/RDp/8Q8R8UzuohpPOj+yME+/mHQkC3Bt/vsHSQ/WKeNK4BbShxyeAf5Prm0O8I+kE/WvkB7vYxu5X3n5eo/ZqaTn73JSAFxTWXRAbnuF1C32B+C7eVqHz83eTum5ada6JAWpO2HBeq7jNuDKiOj2b2j3VJKeJZ1M/9X6rsXWjU9wmzUgd73sTfp4sVmv424oswJJU0ndESfXfJrMrNdwN5SZmRX5yMLMzIo22HMWW2+9dYwYMWJ9l2Fm1qM88MADL0XE4Nr2DTYsRowYwZw5c9Z3GWZmPYqkdq+a4G4oMzMrcliYmVmRw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYtGP8+HX+QTgzsw2Kw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysyGFhZmZFDgszMytqalhI+oqkeZIek3SVpM0kbSXpVklP57+DKvOfLmmBpPmSDq207yNpbp52niQ1s24zM1td08JC0lDgJGBMROwO9AEmAqcBMyNiFDAzjyNptzx9NDAOuEBSn7y6C4HJwKh8G9esus3MbE3N7obqC/ST1BfYHFgCTACm5ulTgSPy8ATg6ohYGRELgQXAvpK2BwZExKyICGBaZRkzM+sGTQuLiPg98F3gOWAp8FpE3AJsGxFL8zxLgW3yIkOBRZVVLM5tQ/NwbfsaJE2WNEfSnGXLlnXl3TEz69Wa2Q01iHS0MBIYAmwh6XP1FmmnLeq0r9kYcVFEjImIMYMHD+5syWZm1oFmdkMdAiyMiGUR8WfgJ8CBwAu5a4n898U8/2JgeGX5YaRuq8V5uLbdzMy6STPD4jlgf0mb508vjQWeAGYAk/I8k4Ab8vAMYKKkTSWNJJ3Inp27qpZL2j+v55jKMmZm1g36NmvFEXGfpOuAB4G3gYeAi4AtgemSjicFypF5/nmSpgOP5/lPjIhVeXUnAJcB/YCb883MzLpJ08ICICLOBM6saV5JOspob/4pwJR22ucAu3d5gWZm1hB/g9vMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysyGFhZmZFDgszMytyWJiZWZHDwszMihwWZmZW5LAwM7Mih4WZmRU5LMzMrMhhYWZmRQ4LMzMrcliYmVmRw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysyGFhZmZFTQ0LSQMlXSfpSUlPSDpA0laSbpX0dP47qDL/6ZIWSJov6dBK+z6S5uZp50lSM+s2M7PVNfvI4vvALyJiV+CvgSeA04CZETEKmJnHkbQbMBEYDYwDLpDUJ6/nQmAyMCrfxjW5bjMzq2haWEgaAHwY+BFARLwVEa8CE4CpebapwBF5eAJwdUSsjIiFwAJgX0nbAwMiYlZEBDCtsoyZmXWDZh5Z7AgsAy6V9JCkiyVtAWwbEUsB8t9t8vxDgUWV5RfntqF5uLZ9DZImS5ojac6yZcu69t6YmfVizQyLvsDewIURsRewgtzl1IH2zkNEnfY1GyMuiogxETFm8ODBna3XzMw60MywWAwsjoj78vh1pPB4IXctkf++WJl/eGX5YcCS3D6snXYzM+smTQuLiHgeWCRpl9w0FngcmAFMym2TgBvy8AxgoqRNJY0kncienbuqlkvaP38K6pjKMmZm1g36Nnn9XwKukLQJ8AxwHCmgpks6HngOOBIgIuZJmk4KlLeBEyNiVV7PCcBlQD/g5nwzM7Nu0tSwiIiHgTHtTBrbwfxTgCnttM8Bdu/S4szMrGH+BreZmRU5LMzMrMhhYWZmRQ4LMzMrcliYmVmRw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysqKGfVZU0kvR72iOqy0TEJ5pTlpmZtZJGf4P7Z8CPgBuBvzStGjMza0mNhsWbEXFeUysxM7OW1WhYfF/SmcAtwMq2xoh4sClVmZlZS2k0LD4AHA18hHe7oSKPm5nZBq7RsPh7YMeIeKuZxZiZWWtq9KOzjwADm1iHmZm1sEaPLLYFnpR0P6ufs/BHZ83MeoFGw+LMplZhZmYtraGwiIg7ml2ImZm1rka/wb2c9OkngE2AjYEVETGgWYWZmVnraPTIon91XNIRwL7NKMjMzFrPWl1IMCJ+hr9jYWbWazTaDfXJyuhGwBje7ZYyM7MNXKOfhhpfGX4beBaY0OXVmJlZS2r0nMVxzS7EzMxaV92wkPSNOpMjIr7VxfWYmVkLKh1ZrGinbQvgeOA9gMPCzKwXqBsWEXF227Ck/sCXgeOAq4GzO1rOzMw2LMVzFpK2Ak4BPgtMBfaOiFeaXZiZmbWOut+zkPRfwP3AcuADEXFWZ4NCUh9JD0m6KY9vJelWSU/nv4Mq854uaYGk+ZIOrbTvI2lunnaeJHXqXpqZ2TopfSnvq8AQ4OvAEkmv59tySa83uI0vA09Uxk8DZkbEKGBmHkfSbsBEYDQwDrhAUp+8zIXAZGBUvo1rcNtmZtYF6oZFRGwUEf0ion9EDKjc+jdyXShJw4DDgYsrzRNI3Vnkv0dU2q+OiJURsRBYAOwraXtgQETMiogAplWWMTOzbrBWl/vohHOBf+Pdn2IF2DYilgLkv9vk9qHAosp8i3Pb0Dxc274GSZMlzZE0Z9myZV1yB8zMrIlhIenjwIsR8UCji7TTFnXa12yMuCgixkTEmMGDBze4WTMzK2n0ch9r4yDgE5IOAzYDBki6HHhB0vYRsTR3Mb2Y518MDK8sPwxYktuHtdNuZmbdpGlHFhFxekQMi4gRpBPXt0XE54AZwKQ82yTghjw8A5goaVNJI0knsmfnrqrlkvbPn4I6prKMmZl1g2YeWXTkO8B0SccDzwFHAkTEPEnTgcdJFys8MSJW5WVOAC4D+gE355uZmXWTbgmLiLgduD0P/wEY28F8U4Ap7bTPAXZvXoVmZlZPsz8NZWZmGwCHhZmZFTkszMysyGFhZmZFDgszMytyWJiZWZHDwszMihwWZmZW5LAwM7Mih4WZmRU5LMzMrMhhYWZmRQ4LMzMrcliYmVmRw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysyGFhZmZFDgszMytyWJiZWZHDwszMihwWZmZW5LAwM7Mih4WZmRU5LMzMrMhhYWZmRQ4LMzMrcliYmVlR08JC0nBJv5b0hKR5kr6c27eSdKukp/PfQZVlTpe0QNJ8SYdW2veRNDdPO0+SmlW3mZmtqZlHFm8DX42I9wP7AydK2g04DZgZEaOAmXmcPG0iMBoYB1wgqU9e14XAZGBUvo1rYt1mZlajaWEREUsj4sE8vBx4AhgKTACm5tmmAkfk4QnA1RGxMiIWAguAfSVtDwyIiFkREcC0yjJmZtYNuuWchaQRwF7AfcC2EbEUUqAA2+TZhgKLKostzm1D83Bte3vbmSxpjqQ5y5Yt69L7YGbWmzU9LCRtCVwPnBwRr9ebtZ22qNO+ZmPERRExJiLGDB48uPPFmplZu5oaFpI2JgXFFRHxk9z8Qu5aIv99MbcvBoZXFh8GLMntw9ppNzOzbtLMT0MJ+BHwRER8rzJpBjApD08Cbqi0T5S0qaSRpBPZs3NX1XJJ++d1HlNZxszMukHfJq77IOBoYK6kh3PbGcB3gOmSjgeeA44EiIh5kqYDj5M+SXViRKzKy50AXAb0A27ONzMz6yZNC4uIuJv2zzcAjO1gmSnAlHba5wC7d111ZmbWGf4Gt5mZFTkszMysyGFhZmZFDgszMytyWJiZWZHDwszMihwWZmZW5LAwM7Mih4WZmRU5LMzMrMhhYWZmRQ4LMzMrcliYmVmRw8LMzIocFmZmVuSwMDOzIoeFmZkVOSzMzKzIYWFmZkUOCzMzK3JYmJlZkcPCzMyKHBZmZlbksDAzsyKHhZmZFTkszMysyGFhZmZFDgszMytyWHRg/PjxjB8/fn2XYWbWEhwWZmZW5LAwM7Mih4WZmRU5LMzMrMhhYWZmRQ4LMzMrclgU+OOzZmYOCzMza4DDwszMinpMWEgaJ2m+pAWSTuvObfvb3GbW2/WIsJDUBzgf+DtgN+AoSbt1dx3V0HCAmFlv0nd9F9CgfYEFEfEMgKSrgQnA4+ujmGpIdDYwbrzxxq4ux8ys6XpKWAwFFlXGFwP71c4kaTIwOY/+UdL8tdze1sBLa7lsXZK6cnVNq7ML9YQawXV2NdfZdbq7xve219hTwqK9/7CxRkPERcBF67wxaU5EjFnX9TRbT6izJ9QIrrOruc6u0yo19ohzFqQjieGV8WHAkvVUi5lZr9NTwuJ+YJSkkZI2ASYCM9ZzTWZmvUaP6IaKiLcl/QvwS6APcElEzGviJte5K6ub9IQ6e0KN4Dq7muvsOi1RoyLW6Po3MzNbTU/phjIzs/XIYWFmZkW9KixKlwxRcl6e/qikvRtdthXqlDRc0q8lPSFpnqQvt2Kdlel9JD0k6aZWrVPSQEnXSXoy79cDWrDGr+TH+zFJV0narBk1NljnrpJmSVop6dTOLNsKdbbga6jD/Zmnd8trCICI6BU30onx3wI7ApsAjwC71cxzGHAz6Xsd+wP3Nbpsi9S5PbB3Hu4PPNWKdVamnwJcCdzUio97njYV+EIe3gQY2Eo1kr6wuhDol8enA8eux325DfBBYApwameWbZE6W+011G6dlelNfw213XrTkcU7lwyJiLeAtkuGVE0ApkVyLzBQ0vYNLrve64yIpRHxIEBELAeeIP0zaak6ASQNAw4HLm5Sfetcp6QBwIeBHwFExFsR8Wor1Zin9QX6SeoLbE7zvoNUrDMiXoyI+4E/d3bZVqiz1V5DdfZnd76GgN7VDdXeJUNqnwQdzdPIsl1lXep8h6QRwF7AfV1fYmM1FOY5F/g34C9Nqq+RGkrz7AgsAy7Nh/oXS9qilWqMiN8D3wWeA5YCr0XELU2osdE6m7FsZ3XJtlrkNVTPuXTPawjoXWHRyCVDOpqnocuNdJF1qTNNlLYErgdOjojXu7C2hmuoN4+kjwMvRsQDXV/WGtZlf/YF9gYujIi9gBVAM/ra12VfDiK9Gx0JDAG2kPS5Lq6vbg3dsGxnrfO2Wug11P6C3fsaAnpXWDRyyZCO5unOy42sS51I2pj0JL8iIn7SpBrXtc6DgE9IepZ06P0RSZe3YJ2LgcUR0fbO8jpSeLRSjYcACyNiWUT8GfgJcGATamy0zmYs21nrtK0Wew11pDtfQ0mzT4q0yo30LvEZ0juwtpNJo2vmOZzVTyLObnTZFqlTwDTg3FbenzXzHExzT3CvU53AXcAuefgs4L9aqUbS1Zfnkc5ViHRC/kvra19W5j2L1U8ct9RrqE6dLfUa6qjOmmlNfQ29s51mb6CVbqRPlDxF+gTC13LbF4EvVp4o5+fpc4Ex9ZZttTqBD5EOYx8FHs63w1qtzpp1NP2Jvo6P+57AnLxPfwYMasEavwk8CTwG/BjYdD3uy+1I75hfB17NwwM6WrbV6mzB11CH+7Oyjqa/hiLCl/swM7Oy3nTOwszM1pLDwszMihwWZmZW5LAwM7Mih4WZmRU5LGyDI+kcSSdXxn8p6eLK+NmSTlnLdR/c3hU+c/trkh7Ot1+tVfFmLcphYRui35C/xSxpI2BrYHRl+oHAPY2sSFKfTmz3rojYM98OqVlPj/gJY7OOOCxsQ3QP717yYjTpy2rLJQ2StCnwfuAhSWPzBQLnSrokT0PSs5K+Ielu4Mj8mwNP5vFPNlqEpGMlXSvpRuAWSVvk7dyftzshz9dP0tX5dyqukXSfpDF52h8r6/uUpMvy8GBJ1+d13S/poNx+Vt7G7ZKekXRSZflj8jYekfRjSf0lLcyXt0DSgHzfN1673W4bMr/bsQ1ORCyR9LakHUihMYt0Nc8DgNdI387dCLgMGBsRT0maBpxAupInwJsR8SGlHxJ6GvgIsAC4ps6m/5ekh/PwtcDv8zb3iIiXJX0buC0iPi9pIDA7d1f9E/CniNhD0h7Agw3cze8D50TE3fl+/pIUggC7An9L+j2G+ZIuBHYGvgYcFBEvSdoqIpZLup10KZGfAROB6yNdY8psNT6ysA1V29FFW1jMqoz/BtiFdAG+p/L8U0m/XdGmLRR2zfM9HelyB/Uu1lbthpqS226NiJfz8MeA03Kg3A5sBuyQt3s5QEQ8SgqzkkOAH+R1zQAGSOqfp/08IlZGxEvAi8C2pLC7LrdRqeli4Lg8fBxwaQPbtl7IRxa2oWo7b/EBUjfUIuCrpGvsXEL7l4euWlEZXpdr4lTXI+AfImJ+dQZJ9bZRba/+XOpGwAER8UY761pZaVpFep2rvW1ExD2SRkj6G6BPRDxW995Yr+UjC9tQ3QN8HHg5Ilbld9IDSd1Cs0gX3hshaac8/9HAHe2s50lgpKT35fGj1qGmXwJfUv6PLmmv3H4n8NnctjuwR2WZFyS9P5+o//tK+y3Av7SNSNqzsO2ZwKclvSfPv1Vl2jTgKnxUYXU4LGxDNZf0Kah7a9pei4iXIuJNUrfLtZLmkn5t7Ie1K8nzTQZ+nk9w/24davoWsDHwqKTH8jjAhcCWkh4l/fLZ7MoypwE3AbeRfgmvzUnAmHzC+nHSlUo7FBHzSL/jfIekR4DvVSZfAQwiBYZZu3zVWbMWk086nxoRc7ppe58CJkTE0d2xPeuZfM7CrBeT9H+BvyP9roJZh3xkYWZmRT5nYWZmRQ4LMzMrcliYmVmRw8LMzIocFmZmVvT/AdAcZcGerIK4AAAAAElFTkSuQmCC",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "32914 different words in TrainSet\n"
     ]
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAf0ElEQVR4nO3df5xWZZ3/8ddb8AcpKAr+CDBIaBPMKIms3NbCEuvLon21xm0Tk5ZybavH2q5au2Xfos3dLcpttSV1AbOQbE0sWSNNTUVwLBRRyUlNEBQMRLSgwM/3j3NNHoZ7bu6Za+57Zpj38/E4jzn355zrnM85c8/9mXNdZ84oIjAzM+usvbo7ATMz691cSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJBYl5B0saTvdHcePZmkz0i6oq/nYHseF5I9lKSLJN3UJvZoO7GmOudyoqSXJL1Qmm6s5z57ooj4ckR8pKPtJC0qnbc/SvpD6fW36pWDpIMkXSXpaUlbJP1K0gU1tp0j6Usdyc16r/7dnYDVzR3AhZL6RcQOSYcDewNvbBMbndatmaT+EbG9g/msjYjhddjuHi8iTmmdlzQHWBMR/9R2vTqcv1nA/sDRwGbgNcAxXbj9LuH3TffzFcme616KwjE+vX478DNgVZvYryNiraRXSlooaaOkFkl/07qh1G11naTvSHoeOFvSKEm3p99UFwNDOpqgpLMl3SVplqSNwMWS9pX075KelPSMpG9JGlBq8w+S1klaK+kcSSFpdFp2m6SPtNn+naXXr5W0OB3jKknvLy2bI+k/Jf04HdNSSUeVlo8rtX0mdREdLul3kg4prXecpA2S9q5wvH/q/pM0MuU+LR3rs5I+24lzGJLOk/Qo8GiKfUPSaknPS7pP0p93Moc3Ad+NiE0R8VJEPBIR1+3ufEqaAXwQ+MdqV59p35+Q9Fja979J2qu0/BxJD0vaJOlmSa+qdtwVtn+CpLslPZfOx9kp/l5Jv0znZ7Wki0tt9kvv89+mdvdKOiwtO1DSlen995SkL0nqV9t3as/mQrKHiog/AEspigXp68+BO9vEWq9GvgesAV4JnA58WdKk0ianAtcBBwHXAN8F7qMoIF8EpnUy1TcDjwGHAjOBSyh+8x1PcbU0DPgcgKTJwKeBdwFjgJNq3Ymk/YHFKe9DgTOByySNK612JvAFYDDQkvJB0kDgp8D/Upyf0cAtEfE0cBvw/tI2/hqYHxF/rDG1E4A/AyYBn5N0dK3HVHIqxXkcm17fS3H+DqY43u9L2q8TOdwDzJT0YUljyg2qnc+ImE3xHvnXiDggIqZU2fdpwATgjRTvsXPS9k8FPgO8DxhK8d793m6Ou5zfkcAi4D9S+/HA8rT4ReAsivfye4Fz0/6geB8fCIwADgE+Bvw+LZsLbKf4/r8BeDfQ4a7KPVJEeNpDJ+Bi4Po0fz/Fh+/kNrFpFD80O4CBpbb/AswpbeeO0rIjKX6g9i/Fvgt8p508TgReAp4rTe8HzgaeLK0nih/yo0qxtwCPp/mrgK+Ulr0GCGB0en0b8JHS8rOBO9P8B4Cft8nrv4DPp/k5wBWlZe8BHknzZwK/bOfYPgDcleb7AU8DE6t8P76T5kem3IeXli8DmnbzPZ0DfKn0OoB37qbNJuD1Hc0BGEDxYX4f8EeK4npKB87nl3aTVwCTS6//lqJAQ1EEppeW7QX8DnhVLccNXER6n9fwc/J1YFaaPwe4Gzi2zTqHAduAAaXYmcDPOvIzuadOviLZs90BnCBpMDA0Ih6l+CF5a4odk9Z5JbAxIraU2v6G4mqg1erS/CuBTRHxYpv1q1kbEQeVpgUVtjsUeAVwX+pWeI7iKmBoab/l9Xe3z7JXAW9u3W7a9geBw0vrPF2a/x1wQJofAfy6ne3eAIyV9GqKK6XNEbGsA3m1t8+OKJ8TJJ2fuoQ2p+M8kOpdjxVziIjfRzE4fxzFb+cLKK5uDqa289nR3H9D8T0mbf8bpW1vpPhFo733ZFvtfs8kvVnSz1IX5GaKq47W83M1cDMwX0X36b+mbspXUXQVryvl9F8UV2N9ngfb92xLKD5EZgB3AUTE85LWptjaiHhc0nbgYEkDS8XkSOCp0rbKj4leBwyWtH+pmBzZZp1alds8S9GNMC4inqqw7jqKD4hWR7ZZ/iJFIWpV/lBbDdweEe/qRI6rKX773EVEbJW0gOJD9LUUH0SN9qdzmMZDLqDoploZES9J2kTxIdz5HRTvmy9T/KY/it2fz1rfCyOAlWn+SGBtml8NzIyIa6qlVWXZamBiO8u+C3yT4upqq6SvkwpJFF2SXwC+IGkkcBPFuOJNFFckQ8ID+7vwFckeLCJ+DzQDf0/Rx9zqzhS7I623muJK5V/SYOOxwHSKfu5K2/1N2u4XJO0j6QSgWj94rfm+BHwbmCXpUABJwySdnFZZQDHQP1bSK4DPt9nEcuB9kl6hYgB+emnZj4DXSPqQpL3T9KYaxyR+BBwu6VMqbgYYKOnNpeXzKLrR/hLo7r+lGUjR7bgB6C/pc8CgzmxI0j+nc7RPGmP5JEW35Cp2fz6fAV5dw27+QdJgSSPS9q9N8W8BF7WOYaWB7jM6kP41wEmS3i+pv6RDJI1PywZSXIFvlTQR+KvSMb9D0uvSIPrzFF16OyJiHfAT4KuSBknaS9JRkv6iAzntsVxI9ny3U1x+31mK/TzFyrf9nknRZ74WuJ6ir3txle3+FcVA50aKD/R5XZTvBRR98feouEPspxQDwUTEIor+7FvTOre2aTsL+APFh9hcSoUwXWm9G2iiOManKQb2991dQqntuyiK5dMUdwm9o7T8LooxoF9ExBMdO9wudzPF+MKvKLqKtlK9C6iaAP6b4kpxLcU5eG9EvFDD+bySosvvOUk/rLKPGyjGYJYDP07tiIjr0/bmp/fBg8Ap7Wxj18QjnqQY5zqf4j26HHh9Wvy3wP+TtIXiRo4FpaaHU9xU8jzwMMXPT+svB2cB+wAPUYw7XQccUWtOezKlQSOzXklSAGMioqWb87iV4lZZ/9V4jXrK987yeYzELJOkN/Hy7atmfY67tswySJpL0f32qTZ3vZn1Ge7aMjOzLL4iMTOzLH1ujGTIkCExcuTI7k7DzKxXue+++56NiKGVlvW5QjJy5Eiam5u7Ow0zs15FUrtPknDXlpmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEh6YApU6YwZUr2PwI0M9ujuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZalbIZG0n6Rlku6XtFLSF1L8YEmLJT2avg4utblIUoukVZJOLsWPk7QiLbtUklJ8X0nXpvhSSSPrdTxmZlZZPa9ItgHvjIjXA+OByZKOBy4EbomIMcAt6TWSxgJNwDhgMnCZpH5pW5cDM4AxaZqc4tOBTRExGpgFXFLH4zEzswrqVkii8EJ6uXeaApgKzE3xucCpaX4qMD8itkXE40ALMFHSEcCgiFgSEQHMa9OmdVvXAZNar1bMzKwx6jpGIqmfpOXAemBxRCwFDouIdQDp66Fp9WHA6lLzNSk2LM23je/UJiK2A5uBQyrkMUNSs6TmDRs2dNHRmZkZ1LmQRMSOiBgPDKe4ujimyuqVriSiSrxam7Z5zI6ICRExYejQiv+73szMOqkhd21FxHPAbRRjG8+k7irS1/VptTXAiFKz4cDaFB9eIb5TG0n9gQOBjfU4BjMzq6yed20NlXRQmh8AnAQ8AiwEpqXVpgE3pPmFQFO6E2sUxaD6stT9tUXS8Wn846w2bVq3dTpwaxpHMTOzBulfx20fAcxNd17tBSyIiB9JWgIskDQdeBI4AyAiVkpaADwEbAfOi4gdaVvnAnOAAcCiNAFcCVwtqYXiSqSpjsdjZmYV1K2QRMQDwBsqxH8LTGqnzUxgZoV4M7DL+EpEbCUVIjMz6x7+y3YzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVmWuhUSSSMk/UzSw5JWSvpkil8s6SlJy9P0nlKbiyS1SFol6eRS/DhJK9KySyUpxfeVdG2KL5U0sl7HY2ZmldXzimQ7cH5EHA0cD5wnaWxaNisixqfpJoC0rAkYB0wGLpPUL61/OTADGJOmySk+HdgUEaOBWcAldTweMzOroG6FJCLWRcQv0vwW4GFgWJUmU4H5EbEtIh4HWoCJko4ABkXEkogIYB5waqnN3DR/HTCp9WrFzMwaoyFjJKnL6Q3A0hT6uKQHJF0laXCKDQNWl5qtSbFhab5tfKc2EbEd2AwcUmH/MyQ1S2resGFD1xyUmZkBDSgkkg4AfgB8KiKep+imOgoYD6wDvtq6aoXmUSVerc3OgYjZETEhIiYMHTq0YwdgZmZV1bWQSNqboohcExH/AxARz0TEjoh4Cfg2MDGtvgYYUWo+HFib4sMrxHdqI6k/cCCwsT5HY2ZmldTzri0BVwIPR8TXSvEjSqudBjyY5hcCTelOrFEUg+rLImIdsEXS8WmbZwE3lNpMS/OnA7emcRQzM2uQ/nXc9tuADwErJC1Psc8AZ0oaT9EF9QTwUYCIWClpAfAQxR1f50XEjtTuXGAOMABYlCYoCtXVkloorkSa6ng8ZmZWQd0KSUTcSeUxjJuqtJkJzKwQbwaOqRDfCpyRkaaZmWXyX7abmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyy1K2QSBoh6WeSHpa0UtInU/xgSYslPZq+Di61uUhSi6RVkk4uxY+TtCItu1SSUnxfSdem+FJJI+t1PGZmVlk9r0i2A+dHxNHA8cB5ksYCFwK3RMQY4Jb0mrSsCRgHTAYuk9QvbetyYAYwJk2TU3w6sCkiRgOzgEvqeDxmZlZB3QpJRKyLiF+k+S3Aw8AwYCowN602Fzg1zU8F5kfEtoh4HGgBJko6AhgUEUsiIoB5bdq0bus6YFLr1YqZmTVGQ8ZIUpfTG4ClwGERsQ6KYgMcmlYbBqwuNVuTYsPSfNv4Tm0iYjuwGTikwv5nSGqW1Lxhw4YuOiozM4MGFBJJBwA/AD4VEc9XW7VCLKrEq7XZORAxOyImRMSEoUOH7i5lMzPrgLoWEkl7UxSRayLif1L4mdRdRfq6PsXXACNKzYcDa1N8eIX4Tm0k9QcOBDZ2/ZGYmVl76nnXloArgYcj4mulRQuBaWl+GnBDKd6U7sQaRTGovix1f22RdHza5llt2rRu63Tg1jSOYmZmDdK/jtt+G/AhYIWk5Sn2GeArwAJJ04EngTMAImKlpAXAQxR3fJ0XETtSu3OBOcAAYFGaoChUV0tqobgSaarj8ZiZWQV1KyQRcSeVxzAAJrXTZiYws0K8GTimQnwrqRCZmVn38F+2m5lZFhcSMzPL4kJiZmZZXEjMzCyLC4mZmWVxITEzsywuJGZmlsWFxMzMsriQmJlZlpr+sj09++rvgJHlNhHxl/VJy8zMeotaH5HyQ4rnWt0IvFS3bMzMrNeptZBsjYhL65qJmZn1SrUWkm9I+jzwE2Bba7D1X+mamVnfVWsheR3FI+HfyctdW5Fem5lZH1ZrITkNeHVE/KGeyZiZWe9T6+2/9wMH1TEPMzPrpWq9IjkMeETSvew8RuLbf83M+rhaC8nn65qFmZn1WjUVkoi4vd6JmJlZ71TrX7ZvobhLC2AfYG/gxYgYVK/EzMysd6j1imRg+bWkU4GJ9UjIzMx6l049tDEifoj/hsTMzKi9a+t9pZd7ARN4uavLzMz6sFrv2ppSmt8OPAFM7fJszMys16mpaysiPlya/iYiZkbE+mptJF0lab2kB0uxiyU9JWl5mt5TWnaRpBZJqySdXIofJ2lFWnapJKX4vpKuTfGlkkZ2+OjNzCxb1SsSSZ+rsjgi4otVls8BvgnMaxOfFRH/3mY/Y4EmYBzwSuCnkl4TETuAy4EZwD3ATcBkYBEwHdgUEaMlNQGXAB+odjxmZtb1dndF8mKFCYoP8QuqNYyIO4CNNeYxFZgfEdsi4nGgBZgo6QhgUEQsiYigKEqnltrMTfPXAZNar1bMzKxxqhaSiPhq6wTMBgYAHwbmA6/u5D4/LumB1PU1OMWGAatL66xJsWFpvm18pzYRsR3YDBxSaYeSZkhqltS8YcOGTqZtZmaV7HaMRNLBkr4EPEDRFfbGiLhgd2Mk7bgcOAoYD6wDvtq6mwrrRpV4tTa7BiNmR8SEiJgwdOjQDiVsZmbVVS0kkv4NuBfYArwuIi6OiE2d3VlEPBMROyLiJeDbvPxHjWuAEaVVhwNrU3x4hfhObST1Bw6k9q40MzPrIru7IjmfYvD7n4C1kp5P0xZJz3d0Z2nMo9VpQOsdXQuBpnQn1ihgDLAsItYBWyQdn8Y/zgJuKLWZluZPB25N4yhmZtZAVe/aiohO/eU7gKTvAScCQyStoXiC8ImSxlN0QT0BfDTtZ6WkBcBDFH+ncl66YwvgXIo7wAZQ3K21KMWvBK6W1EJxJdLU2VzNzKzzav2DxA6LiDMrhK+ssv5MYGaFeDNwTIX4VuCMnBzNzCxfp684zMzMwIXEzMwyuZCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLHUrJJKukrRe0oOl2MGSFkt6NH0dXFp2kaQWSasknVyKHydpRVp2qSSl+L6Srk3xpZJG1utYzMysffW8IpkDTG4TuxC4JSLGALek10gaCzQB41KbyyT1S20uB2YAY9LUus3pwKaIGA3MAi6p25GYmVm76lZIIuIOYGOb8FRgbpqfC5xais+PiG0R8TjQAkyUdAQwKCKWREQA89q0ad3WdcCk1qsVMzNrnEaPkRwWEesA0tdDU3wYsLq03poUG5bm28Z3ahMR24HNwCGVdipphqRmSc0bNmzookMxMzPoOYPtla4kokq8WptdgxGzI2JCREwYOnRoJ1M0M7NKGl1InkndVaSv61N8DTCitN5wYG2KD68Q36mNpP7AgezalWZmZnXW6EKyEJiW5qcBN5TiTelOrFEUg+rLUvfXFknHp/GPs9q0ad3W6cCtaRzFzMwaqH+9Nizpe8CJwBBJa4DPA18BFkiaDjwJnAEQESslLQAeArYD50XEjrSpcynuABsALEoTwJXA1ZJaKK5Emup1LGZm1r66FZKIOLOdRZPaWX8mMLNCvBk4pkJ8K6kQmZlZ9+kpg+1mZtZLuZCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWpVsKiaQnJK2QtFxSc4odLGmxpEfT18Gl9S+S1CJplaSTS/Hj0nZaJF0qSd1xPGZmfVl3XpG8IyLGR8SE9PpC4JaIGAPckl4jaSzQBIwDJgOXSeqX2lwOzADGpGlyA/M3MzN6VtfWVGBump8LnFqKz4+IbRHxONACTJR0BDAoIpZERADzSm3MzKxBuquQBPATSfdJmpFih0XEOoD09dAUHwasLrVdk2LD0nzb+C4kzZDULKl5w4YNXXgYZmbWv5v2+7aIWCvpUGCxpEeqrFtp3COqxHcNRswGZgNMmDCh4jpmZtY53XJFEhFr09f1wPXAROCZ1F1F+ro+rb4GGFFqPhxYm+LDK8TNzKyBGl5IJO0vaWDrPPBu4EFgITAtrTYNuCHNLwSaJO0raRTFoPqy1P21RdLx6W6ts0ptzMysQbqja+sw4Pp0p25/4LsR8b+S7gUWSJoOPAmcARARKyUtAB4CtgPnRcSOtK1zgTnAAGBRmszMrIEaXkgi4jHg9RXivwUmtdNmJjCzQrwZOKarczQzs9r1pNt/zcysF3IhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZXEhMTOzLC4kZmaWxYXEzMyyuJCYmVkWFxIzM8viQmJmZllcSDphypQp3Z2CmVmP0esLiaTJklZJapF0YXfnY2bW1/TqQiKpH/CfwCnAWOBMSWMbse8pU6b4ysTMDOjf3Qlkmgi0RMRjAJLmA1OBhxqVQHvF5MYbb2xUCmZm3aq3F5JhwOrS6zXAm9uuJGkGMCO9fEHSqk7ubwjwbC0rSurkLrpMzbl2M+fZtZxn1+studY7z1e1t6C3F5JKn9axSyBiNjA7e2dSc0RMyN1OI/SWXJ1n13KeXa+35NqdefbqMRKKK5ARpdfDgbXdlIuZWZ/U2wvJvcAYSaMk7QM0AQu7OSczsz6lV3dtRcR2SR8Hbgb6AVdFxMo67jK7e6yBekuuzrNrOc+u11ty7bY8FbHLkIKZmVnNenvXlpmZdTMXEjMzy+JCkuzuUSsqXJqWPyDpjbW27UF5PiFphaTlkpq7Oc/XSloiaZukT3ekbQ/Ks2Hns8ZcP5i+5w9IulvS62tt24Py7Env0akpx+WSmiWdUGvbHpRnY85nRPT5iWKg/tfAq4F9gPuBsW3WeQ+wiOJvV44HltbatifkmZY9AQzpIefzUOBNwEzg0x1p2xPybOT57ECubwUGp/lTevB7tGKePfA9egAvjyMfCzzSQ89nxTwbeT59RVL406NWIuIPQOujVsqmAvOicA9wkKQjamzbE/JspN3mGRHrI+Je4I8dbdtD8my0WnK9OyI2pZf3UPxdVU1te0iejVRLni9E+jQG9uflP3buaeezvTwbxoWkUOlRK8NqXKeWtl0lJ08o3mA/kXRfemxMveSck552Pqtp1PmEjuc6neLKtDNtc+TkCT3sPSrpNEmPAD8GzulI2x6QJzTofPbqvyPpQrU8aqW9dWp6TEsXyckT4G0RsVbSocBiSY9ExB1dmuHuc6hn247K3Vejzid0IFdJ76D4gG7tK++R57RCntDD3qMRcT1wvaS3A18ETqq1bRfJyRMadD59RVKo5VEr7a3TyMe05ORJRLR+XQ9cT3HZ3F151qNtR2Xtq4HnE2rMVdKxwBXA1Ij4bUfa9oA8e+x7NH34HiVpSEfbZsrJs3Hns96DML1horgyewwYxcsDWuParPNedh7EXlZr2x6S5/7AwNL83cDk7sqztO7F7DzY3qPOZ5U8G3Y+O/C9PxJoAd7a2ePs5jx71HsUGM3Lg9hvBJ5KP1c97Xy2l2fjzme93vi9baK42+lXFHdIfDbFPgZ8LM2L4p9o/RpYAUyo1ran5Ulx18f9aVrZA/I8nOK3reeB59L8oB54Pivm2ejzWWOuVwCbgOVpau6h79GKefbA9+gFKY/lwBLghB56Pivm2cjz6UekmJlZFo+RmJlZFhcSMzPL4kJiZmZZXEjMzCyLC4mZmWVxIbE+Q9IsSZ8qvb5Z0hWl11+V9Ped3PaJkn7UTnxzevrqckk/7VTyZj2YC4n1JXdTPHkWSXsBQ4BxpeVvBe6qZUOS+nVgvz+PiPFpOqm8QJIfU2S9nguJ9SV3kQoJRQF5ENgiabCkfYGjgV9KmiTpl+n/OFyVlrX+b4fPSboTOCP9n4hH0uv31ZqEpLMlfV/SjRQP1Ns/7efetN+pab0Bkuan/zVxraSlkiakZS+Utne6pDlpfqikH6Rt3SvpbSl+cdrHbZIek/SJUvuz0j7ul3S1pIGSHpe0d1o+KB373p077ban829D1mdE8fC67ZKOpCgoSyiepPoWYDPwAMUvV3OASRHxK0nzgHOBr6fNbI2IEyTtBzwKvJPicR/XVtn1n0tanua/T/EIi7cAx0bERklfBm6NiHMkHQQsS11gHwV+FxHHpmdT/aKGw/wGMCsi7kzHeTNFgQR4LfAOYCCwStLlwGuAz1I83O9ZSQdHxBZJt1E8bueHQBPwg4jo7kfpWw/lKxLra1qvSloLyZLS67uBPwMej4hfpfXnAm8vtW8tGK9N6z0axeMhvlNln+WurZkptjgiNqb5dwMXpmJzG7AfxfOo3t663Yh4gKLQ7c5JwDfTthYCgyQNTMt+HBHbIuJZYD1wGEUhvC7FKOV0BfDhNP9h4L9r2Lf1Ub4isb6mdZzkdRRdW6uB8ymepXUVlR/bXfZiaT7n+ULl7Qj4vxGxqryCpGr7KMf3K83vBbwlIn5fYVvbSqEdFD//qrSPiLhL0khJfwH0i4gHqx6N9Wm+IrG+5i7g/wAbI2JH+g38IIqupiXAI8BISaPT+h8Cbq+wnUeAUZKOSq/PzMjpZuDvlD7tJb0hxe8APphix1D8G9VWz0g6Ot00cFop/hPg460vJI3fzb5vAd4v6ZC0/sGlZfOA7+GrEdsNFxLra1ZQ3K11T5vY5oh4NiK2UnTlfF/SCuAl4FttN5LWmwH8OA22/yYjpy8CewMPSHowvQa4HDhA0gPAPwLLSm0uBH4E3AqsK8U/AUxIg+cPUTwltl0RsZLi/9HfLul+4GulxdcAgymKiVm7/PRfs14iDYB/OiKaG7S/0yn+8dSHGrE/6708RmJmu5D0H8ApFP8Lw6wqX5GYmVkWj5GYmVkWFxIzM8viQmJmZllcSMzMLIsLiZmZZfn/xeVbNwvkkHQAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "df_test_fre = visualize(df_test, dataset='TestSet')\n",
    "df_valid_fre = visualize(df_valid, dataset='ValidSet')\n",
    "df_train_fre = visualize(df_train, dataset='TrainSet')"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 60,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "                      word  count  frequency\n",
      "0                    arrow   1513   0.141972\n",
      "1                    right   1296   0.121610\n",
      "2                       ct   1282   0.120297\n",
      "3                    image   1185   0.111195\n",
      "4                     left   1016   0.095336\n",
      "...                    ...    ...        ...\n",
      "10652               zstack      1   0.000094\n",
      "10653                zubal      1   0.000094\n",
      "10654                   zy      1   0.000094\n",
      "10655               zygoma      1   0.000094\n",
      "10656  zygomaticomaxillary      1   0.000094\n",
      "\n",
      "[10657 rows x 3 columns]\n"
     ]
    }
   ],
   "source": [
    "print(df_test_fre)\n",
    "df_test_fre.to_csv('/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/df_test_fre.csv', sep=',', index=False)\n",
    "df_valid_fre.to_csv('/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/df_valid_fre.csv', sep=',', index=False)\n",
    "df_train_fre.to_csv('/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/df_train_fre.csv', sep=',', index=False)\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 62,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "1e-4: 8.726700000000001\n",
      "1e-3: 87.267\n"
     ]
    }
   ],
   "source": [
    "# 词频低于 1e-4 的占比\n",
    "total_num_word = sum(wordcount)\n",
    "print(f\"1e-4: {1e-4 * total_num_word}\")\n",
    "print(f\"1e-3: {1e-3 * total_num_word}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 61,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZIAAAEWCAYAAABMoxE0AAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAfW0lEQVR4nO3df5xVdb3v8ddbUMQfKMr4C1BI6QeQlRKpdbomnqQfhJZ2sFIyTty8VnZOP9TqpF2jm6fSspN2vVpAeSRSUzQtCY+/ERzNRFSSxIRAwTBFTRT93D/Wdx+Ww56ZPfNl9t7TvJ+Px37MWt/1XWt99h7Y773Wd+01igjMzMy6a5tGF2BmZr2bg8TMzLI4SMzMLIuDxMzMsjhIzMwsi4PEzMyyOEis6Ug6S9LPGl1HbybpZElPSHpW0u6Nrsf+vjlIrFOSzpB0XZu2h9tpm9LDtRwu6ZX0Bll5XNOT++xtJG0LnAu8OyJ2ioi/tFk+QlJIuqdN+xBJL0p6tNT2qKQj0/TH03rntlnv6NQ+s832+5f6jJd0naS/SlovabGkkzp4DntLukTSGkkbJD0k6euSdpQ0TNJTkt5R6j88tb0tzd8k6YX07+NJSVdK2rvrr6bVwkFitbgFeLukfgCS9gK2BQ5q03ZA6luz8ptNF6xOb5CVx6SttN2/F3sC2wNLO+m3o6SxpfmPACs6WeePwD+1eX1PBP7Q3gqSDgVuBG6m+DeyO3Ay8J52+u8GLAQGAodGxM7APwK7AvtHxCrgNOBiSdun1f4v8JOIWFTa1KcjYqe0z52A73Ty3KybHCRWi7soguPNaf6dwH8By9q0/TEiVkvaR9K89MlzuaRPVjaUTltdLulnkp4BPi5ppKSb0yfP+cCQrhaYPi3fLuk8SeuBsyQNkPQdSY+l0zw/kjSwtM4X0yfe1ZI+kT5FH5CW3STpn9ts/7bS/OslzU/PcZmkD5eWzZT0Q0m/Ss9pkaT9S8vHlNZ9QtKXJe0l6fnyaShJB0tal44w2j7fAZK+l2pfnaYHSHpt+r0A/FXSjR28bD8FppbmTwRmd/JSPw4sAY5KdewGHAbM62CdbwOzIuKciHgyCndHxIfb6f+vwAbgYxHxKEBErIyIUyPivtTn/wFrgDMlTQVeB3y12sYi4q/AVWz+t2pbmYPEOhURLwKLKMKC9PNW4LY2bZWjkcuAVcA+wLHANyVNKG1yMnA5xSfMS4H/BO6mCJCzefWbW1e8DXgE2AOYAZwDvJbiDeQAYCjwNQBJE4EvUHzSHQUcWetOJO0IzE917wEcD1wgaUyp2/HA14HBwPJUD5J2Bn4L/Jri9TkAWBARjwM3AeU3148BcyLipSplfAU4JD23NwHjga9GxB+ASh27RsQRHTyVnwFTJPWT9AZgZ4rfc2dmU4QOwBTgamBjtY6SdgAOpfh91+pI4MqIeKW9DlHc2+mfgf8FfA/4ZEQ8304NuwMfpPg9WA9wkFitbmZzaPwDRZDc2qbtZknDgXcAp0XECxFxL3AxcEJpWwsj4qr0RtECvBX4t4jYGBG3AJ2NeeyTzrVXHpU339UR8YOI2AS8AHwS+JeIWB8RG4BvUrzxQfGG/ZOIuD8ingPO6sJr8X7g0Yj4SURsioh7gCsoQrPiyohYnGq5lM2fht8PPB4R302vz4bS6ZhZFOFBOmV4PMVRQzUfBf53RKyNiHUUoXVCO33bs4ri6OVIivDu7Gik4pfA4ZJ2ofOjmMEU7zNrulDX7jX2/xOwGniG6qdUz5f0NPAkxYeUz3ShBusCB4nV6hbgHZIGAy0R8TBwB3BYahub+uwDVN64K/5EcTRQsbI0vQ/wVHozL/fvyOqI2LX0mFtluy3ADsDdlcChOApoKe233L+zfZbtB7ytHGYUb+x7lfo8Xpp+nuIcPcBwinGGaq4GRkt6DcWR0tMRsbidvvu0qflPqa2rZgMfpwitmq6Ui4i/Ab+iOJU0JCJu76D7U8ArQFcGuv9SY//TU9+1FEeXbX02InYBDqQItGFdqMG6wEFitVoI7AJMB24HiIhnKD4RTqd4c1+R5ndLp3Aq9gX+XJov33J6DTA4nS4q9++O8nafBP4GjCkFzi5p8LWy3+Ed7PM5iiCqKIfESuDmNmG2U0ScXEONK4H9qy2IiBeAuRShdALtH41A8Trv16b+1TXsv60rgPcBj0REV8J0NvB5Oq6RdLppIfChLmz7t8Axktp9f5I0GvgixemtacCXJY1qp4YlwDeAH0pSF+qwGjlIrCbpU2grxUDoraVFt6W2W1K/lRRHKv9H0vaSDqT4j35pO9v9U9ru1yVtly7p3OIqrG7U+wrFgOx5kvYAkDRU0lGpy1yKgf7R6Tz+mW02cS/wQUk7pAH4aaVl1wKvlXSCpG3T461pnKEz1wJ7SfpcGhzfWemS1aRyhPABOj5CuAz4qqQWSUMoxn66/N2bdCR4BMUbclfcTHHU9IMa+n6J4rX+YuViAklvkjSnnf7nAoOAWZL2S/2HSjpX0oEpYC4B/j0iHkoD8OcDF3UQFLMoxrM+UOsTtNo5SKwrbqb4z3hbqe3W1FY+R308MILiE/IvgTMjYn4H2/0IxUD5eoo39FrP1XfmNIoB1jtVXCH2W4qre4iI6ykGaW9Mfdpe3XQe8CLwBMWb0H8HYTpt926K8ZbVFKexzgEGdFZQWvcfKcLyceBh4F2l5bdTnAq6p3LFUju+QRHA91FcRXVPauuyiGiNiPZOt7W3TkTEgohYX0PfOyjC6gjgkXRV3UXAde30X09xJdhLwCJJG4AFwNMUv6tTKY4W/7202tkUR41VAzFdMHI+8G81PUHrEvkPW5kVJAUwKiIaenVPumT3PyPi4kbWYVarvvylLbOmI+mtwEEUl0ib9Qo+tWXWJCTNojj99rk2V72ZNTWf2jIzsyw+IjEzsyx9boxkyJAhMWLEiEaXYWbWq9x9991PRkRLtWV9LkhGjBhBa2tro8swM+tVJLX7hVWf2jIzsywOEjMzy+IgMTOzLA4SMzPL4iAxM7MsDhIzM8viIDEzsywOEjMzy+IgMTOzLA6SLpg0aRKTJmX/8T4zs78rDhIzM8viIDEzsywOEjMzy+IgMTOzLA4SMzPL4iAxM7MsPRYkkn4saa2k+0tt35b0kKT7JP1S0q6lZWdIWi5pmaSjSu0HS1qSlp0vSal9gKSfp/ZFkkb01HMxM7P29eQRyUxgYpu2+cDYiDgQ+ANwBoCk0cAUYExa5wJJ/dI6FwLTgVHpUdnmNOCpiDgAOA84p8eeiZmZtavHgiQibgHWt2m7ISI2pdk7gWFpejIwJyI2RsQKYDkwXtLewKCIWBgRAcwGji6tMytNXw5MqBytmJlZ/TRyjOQTwPVpeiiwsrRsVWobmqbbtr9qnRROTwO7V9uRpOmSWiW1rlu3bqs9ATMza1CQSPoKsAm4tNJUpVt00N7ROls2RlwUEeMiYlxLS0tXyzUzsw7UPUgkTQXeD3w0na6C4khjeKnbMGB1ah9Wpf1V60jqD+xCm1NpZmbW8+oaJJImAqcBH4iI50uL5gFT0pVYIykG1RdHxBpgg6RD0vjHicDVpXWmpuljgRtLwWRmZnXSv6c2LOky4HBgiKRVwJkUV2kNAOancfE7I+JTEbFU0lzgAYpTXqdExMtpUydTXAE2kGJMpTKucgnwU0nLKY5EpvTUczEzs/b1WJBExPFVmi/poP8MYEaV9lZgbJX2F4Djcmo0M7N8/ma7mZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZlh4LEkk/lrRW0v2ltt0kzZf0cPo5uLTsDEnLJS2TdFSp/WBJS9Ky8yUptQ+Q9PPUvkjSiJ56LmZm1r6ePCKZCUxs03Y6sCAiRgEL0jySRgNTgDFpnQsk9UvrXAhMB0alR2Wb04CnIuIA4DzgnB57JmZm1q4eC5KIuAVY36Z5MjArTc8Cji61z4mIjRGxAlgOjJe0NzAoIhZGRACz26xT2dblwITK0YqZmdVPvcdI9oyINQDp5x6pfSiwstRvVWobmqbbtr9qnYjYBDwN7F5tp5KmS2qV1Lpu3bqt9FTMzAyaZ7C92pFEdNDe0TpbNkZcFBHjImJcS0tLN0s0M7Nq6h0kT6TTVaSfa1P7KmB4qd8wYHVqH1al/VXrSOoP7MKWp9LMzKyH1TtI5gFT0/RU4OpS+5R0JdZIikH1xen01wZJh6TxjxPbrFPZ1rHAjWkcxczM6qh/T21Y0mXA4cAQSauAM4FvAXMlTQMeA44DiIilkuYCDwCbgFMi4uW0qZMprgAbCFyfHgCXAD+VtJziSGRKTz0XMzNrX48FSUQc386iCe30nwHMqNLeCoyt0v4CKYjMzKxxmmWw3czMeikHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVmWhgSJpH+RtFTS/ZIuk7S9pN0kzZf0cPo5uNT/DEnLJS2TdFSp/WBJS9Ky8yWpEc/HzKwvq3uQSBoKfBYYFxFjgX7AFOB0YEFEjAIWpHkkjU7LxwATgQsk9UubuxCYDoxKj4l1fCpmZkbjTm31BwZK6g/sAKwGJgOz0vJZwNFpejIwJyI2RsQKYDkwXtLewKCIWBgRAcwurWNmZnVS9yCJiD8D3wEeA9YAT0fEDcCeEbEm9VkD7JFWGQqsLG1iVWobmqbbtpuZWR014tTWYIqjjJHAPsCOkj7W0SpV2qKD9mr7nC6pVVLrunXrulqymZl1oBGnto4EVkTEuoh4CbgSOAx4Ip2uIv1cm/qvAoaX1h9GcSpsVZpu276FiLgoIsZFxLiWlpat+mTMzPq6RgTJY8AhknZIV1lNAB4E5gFTU5+pwNVpeh4wRdIASSMpBtUXp9NfGyQdkrZzYmkdMzOrk/713mFELJJ0OXAPsAn4HXARsBMwV9I0irA5LvVfKmku8EDqf0pEvJw2dzIwExgIXJ8eZmZWR3UPEoCIOBM4s03zRoqjk2r9ZwAzqrS3AmO3eoFmZlYzf7PdzMyyOEjMzCyLg8TMzLI4SMzMLIuDxMzMsjhIzMwsi4PEzMyyOEjMzCyLg8TMzLLU9M32dI+rzwAjyutExAd6piwzM+star1FylXAJcA1wCs9Vo2ZmfU6tQbJCxFxfo9WYmZmvVKtQfJ9SWcCN1DcXBGAiLinR6oyM7Neo9YgeSNwAnAEm09tRZo3M7M+rNYgOQZ4TUS82JPFmJlZ71Pr5b+/B3btwTrMzKyXqvWIZE/gIUl38eoxEl/+a2bWx9UaJG3/mqGZmRlQY5BExM09XYiZmfVOtX6zfQPFVVoA2wHbAs9FxKCeKszMzHqHWo9Idi7PSzoaGN8TBZmZWe/SrZs2RsRV+DskZmZG7ae2Plia3QYYx+ZTXWZm1ofVetXWpNL0JuBRYPJWr8bMzHqdWsdITtqaO5W0K3AxMJbiyOYTwDLg5xS3qn8U+HBEPJX6nwFMA14GPhsRv0ntBwMzgYHAdcCpEeEjJTOzOuowSCR9rYPFERFnd3O/3wd+HRHHStoO2AH4MrAgIr4l6XTgdOA0SaOBKcAYYB/gt5JeGxEvAxcC04E7KYJkInB9N2syM7Nu6Gyw/bkqDyiODk7rzg4lDQLeSfH3TYiIFyPirxSnymalbrOAo9P0ZGBORGyMiBXAcmC8pL2BQRGxMB2FzC6tY2ZmddLhEUlEfLcyLWln4FTgJGAO8N321uvEa4B1wE8kvQm4O213z4hYk/a7RtIeqf9QiiOOilWp7aU03bZ9C5KmUxy5sO+++3azbDMzq6bTy38l7SbpG8B9FMFzUEScFhFru7nP/sBBwIUR8RaKo5zTOyqhSlt00L5lY8RFETEuIsa1tLR0tV4zM+tAh0Ei6dvAXcAG4I0RcVZlADzDKmBVRCxK85dTBMsT6XQV6efaUv/hpfWHAatT+7Aq7WZmVkedHZF8nmKA+6vAaknPpMcGSc90Z4cR8TiwUtLrUtME4AFgHjA1tU0Frk7T84ApkgZIGgmMAhan02AbJB0iScCJpXXMzKxOOhsj6dY332vwGeDSdMXWIxTjLtsAcyVNAx4Djks1LJU0lyJsNgGnpCu2AE5m8+W/1+MrtszM6q7WLyRuVRFxL8W349ua0E7/GcCMKu2tFN9FMTOzBumpIw4zM+sjHCRmZpbFQWJmZlkcJGZmlsVBYmZmWRwkZmaWxUFiZmZZHCRmZpbFQWJmZlkcJGZmlsVBYmZmWRwkZmaWxUFiZmZZHCRmZpbFQWJmZlkcJGZmlsVBYmZmWRwkZmaWxUFiZmZZHCRmZpbFQWJmZlkcJGZmlsVBYmZmWRwkZmaWpWFBIqmfpN9JujbN7yZpvqSH08/Bpb5nSFouaZmko0rtB0takpadL0mNeC5mZn1ZI49ITgUeLM2fDiyIiFHAgjSPpNHAFGAMMBG4QFK/tM6FwHRgVHpMrE/pZmZW0ZAgkTQMeB9wcal5MjArTc8Cji61z4mIjRGxAlgOjJe0NzAoIhZGRACzS+uYmVmdNOqI5HvAl4BXSm17RsQagPRzj9Q+FFhZ6rcqtQ1N023btyBpuqRWSa3r1q3bKk/AzMwKdQ8SSe8H1kbE3bWuUqUtOmjfsjHioogYFxHjWlpaatytmZnVon8D9vl24AOS3gtsDwyS9DPgCUl7R8SadNpqbeq/ChheWn8YsDq1D6vSbmZmdVT3I5KIOCMihkXECIpB9Bsj4mPAPGBq6jYVuDpNzwOmSBogaSTFoPridPprg6RD0tVaJ5bWMTOzOmnEEUl7vgXMlTQNeAw4DiAilkqaCzwAbAJOiYiX0zonAzOBgcD16WFmZnXU0CCJiJuAm9L0X4AJ7fSbAcyo0t4KjO25Cs3MrDP+ZruZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVkWB4mZmWVxkJiZWRYHiZmZZXGQmJlZFgeJmZllcZCYmVmWugeJpOGS/kvSg5KWSjo1te8mab6kh9PPwaV1zpC0XNIySUeV2g+WtCQtO1+S6v18zMz6ukYckWwCPh8RbwAOAU6RNBo4HVgQEaOABWmetGwKMAaYCFwgqV/a1oXAdGBUekys5xMxM7MGBElErImIe9L0BuBBYCgwGZiVus0Cjk7Tk4E5EbExIlYAy4HxkvYGBkXEwogIYHZpHTMzq5OGjpFIGgG8BVgE7BkRa6AIG2CP1G0osLK02qrUNjRNt22vtp/pklolta5bt26rPgczs76uYUEiaSfgCuBzEfFMR12rtEUH7Vs2RlwUEeMiYlxLS0vXizUzs3Y1JEgkbUsRIpdGxJWp+Yl0uor0c21qXwUML60+DFid2odVaTczszpqxFVbAi4BHoyIc0uL5gFT0/RU4OpS+xRJAySNpBhUX5xOf22QdEja5omldczMrE76N2CfbwdOAJZIuje1fRn4FjBX0jTgMeA4gIhYKmku8ADFFV+nRMTLab2TgZnAQOD69DAzszqqe5BExG1UH98AmNDOOjOAGVXaW4GxW686MzPrKn+z3czMsjhIzMwsi4PEzMyyOEjMzCyLg8TMzLI4SMzMLIuDxMzMsjhIzMwsi4PEzMyyOEjMzCyLg8TMzLI4SMzMLIuDxMzMsjhIzMwsi4PEzMyyOEjMzCyLg8TMzLI4SMzMLIuDxMzMsjhIzMwsi4PEzMyyOEjMzCyLg8TMzLI4SLph0qRJjS7BzKxp9PogkTRR0jJJyyWd3uh6zMz6ml4dJJL6AT8E3gOMBo6XNLoe+540aZKPTMzMgP6NLiDTeGB5RDwCIGkOMBl4oF4F1BIm11xzTR0qMTNrjN4eJEOBlaX5VcDb2naSNB2YnmaflbSsm/sbAjzZ1ZUkdXN3NetWXXXgurquWWtzXV3z91jXfu0t6O1BUu0dOrZoiLgIuCh7Z1JrRIzL3c7W5rq6plnrguatzXV1TV+rq1ePkVAcgQwvzQ8DVjeoFjOzPqm3B8ldwChJIyVtB0wB5jW4JjOzPqVXn9qKiE2SPg38BugH/DgilvbgLrNPj/UQ19U1zVoXNG9trqtr+lRdithiSMHMzKxmvf3UlpmZNZiDxMzMsjhIqujstisqnJ+W3yfpoCap6/WSFkraKOkL9aipxro+ml6n+yTdIelNTVLX5FTTvZJaJb2jGeoq9XurpJclHdsMdUk6XNLT6fW6V9LXmqGuUm33Sloq6eZmqEvSF0uv1f3pd7lbE9S1i6RrJP0+vV4nZe80IvwoPSgG7f8IvAbYDvg9MLpNn/cC11N8j+UQYFGT1LUH8FZgBvCFJnq9DgMGp+n3NNHrtRObxwkPBB5qhrpK/W4ErgOObYa6gMOBa+vx76qLde1KcTeLfdP8Hs1QV5v+k4Abm6Eu4MvAOWm6BVgPbJezXx+RbOm/b7sSES8ClduulE0GZkfhTmBXSXs3uq6IWBsRdwEv9XAtXa3rjoh4Ks3eSfF9n2ao69lI/5uAHanyZdZG1JV8BrgCWFuHmrpSV73VUtdHgCsj4jEo/h80SV1lxwOXNUldAeys4pYbO1EEyaacnTpItlTttitDu9GnEXU1QlfrmkZxNNfTaqpL0jGSHgJ+BXyiGeqSNBQ4BvhRHeqpua7k0HRK5HpJY5qkrtcCgyXdJOluSSc2SV0ASNoBmEjxwaAZ6voP4A0UX95eApwaEa/k7LRXf4+kh9Ry25Wabs2ylTVin7WouS5J76IIknqMRdR6+5xfAr+U9E7gbODIJqjre8BpEfFyHe7TVlFLXfcA+0XEs5LeC1wFjGqCuvoDBwMTgIHAQkl3RsQfGlxXxSTg9ohY34P1VNRS11HAvcARwP7AfEm3RsQz3d2pj0i2VMttVxpxa5ZmvR1MTXVJOhC4GJgcEX9plroqIuIWYH9JQ5qgrnHAHEmPAscCF0g6utF1RcQzEfFsmr4O2LZJXq9VwK8j4rmIeBK4BejpCzq68u9rCvU5rQW11XUSxanAiIjlwArg9Vl77enBn972oPh08wgwks2DVWPa9Hkfrx5sX9wMdZX6nkX9Bttreb32BZYDhzXZ7/EANg+2HwT8uTLfDL/H1H8m9Rlsr+X12qv0eo0HHmuG14viNM2C1HcH4H5gbKPrSv12oRiD2LGnf4ddeL0uBM5K03umf/dDcvbrU1ttRDu3XZH0qbT8RxRX0ryX4s3xeYqEb3hdkvYCWoFBwCuSPkdxxUa3D1m3Rl3A14DdKT5ZA2yKHr4zao11fQg4UdJLwN+Af4r0v6vBddVdjXUdC5wsaRPF6zWlGV6viHhQ0q+B+4BXgIsj4v5G15W6HgPcEBHP9WQ9XazrbGCmpCUUH4ZPi+JIrtt8ixQzM8viMRIzM8viIDEzsywOEjMzy+IgMTOzLA4SMzPL4iCxPkPSeemS6Mr8byRdXJr/rqR/7ea2D5d0bTvt5Tvm/rZbxZs1MQeJ9SV3UNyJGEnbAEOA8v2iDgNur2VDkvp1Yb+3RsSb0+NVt2CR5O9yWa/nILG+5HZSkFAEyP3ABkmDJQ2g+Ib07yRNkPQ7SUsk/TgtQ9Kjkr4m6TbguPR3Hx5K8x+stQhJH5f0C0nXADdI2jHt566038mp30BJc1T8zZSfS1okaVxa9mxpe8dKmpmmWyRdkbZ1l6S3p/az0j5ukvSIpM+W1j8x7eP3kn4qaWdJKyRtm5YPSs992+697Pb3zp+GrM+IiNWSNknalyJQFlLcGfVQ4GmKb0ZvQ3FbkgkR8QdJs4GTKW6kCPBCRLxD0vbAwxQ3vlsO/LyDXf+DpHvT9C8obklxKHBgRKyX9E2Kv1XxCUm7AovTKbD/CTwfEQeme5XdU8PT/D5wXkTclp7nbygCEor7Kb0L2BlYJulCijvnfgV4e0Q8KWm3iNgg6SaKWwFdRXGvqCsiop5/nsB6ER+RWF9TOSqpBMnC0vwdwOuAFbH5zrGzgHeW1q8ExutTv4fTbUJ+1sE+y6e2ZqS2+bH5brDvBk5PYXMTsD3F/cneWdluRNxHEXSdORL4j7StecAgSTunZb+KiI3pdhhrKe6zdARweeUWGaWaLmbzrX9OAn5Sw76tj/IRifU1lXGSN1Kc2loJfB54Bvgx1W/DXVa+Z1LO/YXK2xHwoYhYVu6Q7kvW3j7K7duXprcBDo2Iv1XZ1sZS08sU//9VbR8RcbukEZL+B9Cvp+9dZb2bj0isr7kdeD+wPiJeTp/Ad6U41bQQeAgYIemA1P8EoNrfAH8IGClp/zR/fEZNvwE+o/RuL+ktqf0W4KOpbSzFnwOueELSG9JFA8eU2m8APl2ZkfTmTva9APiwpN1T//LfFJ9NcftzH41Yhxwk1tcsobha6842bU9HxJMR8QLFqZxfpLujvkKVv1SY+k0HfpUG2/+UUdPZwLbAfZLuT/NQ3O57J0n3AV8CFpfWOR24luLvuq8ptX8WGJcGzx8APtXRjiNiKTADuFnS74FzS4svBQZTv7+lYb2U7/5r1kukAfAvRERrnfZ3LMUfIjuhHvuz3stjJGa2BUk/AN5D8Xd3zDrkIxIzM8viMRIzM8viIDEzsywOEjMzy+IgMTOzLA4SMzPL8v8B61LrohOECL0AAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    },
    {
     "data": {
      "image/png": "iVBORw0KGgoAAAANSUhEUgAAAZYAAAEWCAYAAABFSLFOAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjMuMiwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8vihELAAAACXBIWXMAAAsTAAALEwEAmpwYAAAgP0lEQVR4nO3de7QcVZn+8e9Dwk0gkkDAkASCEkVAVMggF51RQYk4MYiiQYSgOBn5ocJ4m6D+BJeTGfACyIyADCKJIDHihYAiYhAQRCBc5B4SCZCYGILhElDAwDt/7N1Q6XT36ZOzz+mcnOezVq+u2rV31Vvd1f121a6uUkRgZmZWygadDsDMzNYvTixmZlaUE4uZmRXlxGJmZkU5sZiZWVFOLGZmVpQTi7VF0kmSLuh0HP2ZpGMkLZP0lKStOh2PWW9xYumnJJ0g6Rd1ZfOblE3q5VjeKumF/IVZe1zam8vsbyRtCJwKvDMiNo+Iv9RNHyMpJN1aV761pOckPVgpe1DSAXn4qNzu1Lp2B+fy8+vmP7hSZy9Jv5D0uKQVkm6S9JEW6zBC0nclLZW0UtJ9kr4iaTNJoyQ9JunNlfqjc9mb8vjVkp7J28ejkn4iaUT3X01b1zmx9F/XAvtJGgQg6RXAhsAedWU75bptq375dMOS/IVZe0woNN/1xbbAJsDdXdTbTNJulfEPAQu7aPNH4IN1r++RwP3NGkjaB7gKuIa0jWwFHAO8q0n9YcANwKbAPhGxBfAOYEvgVRGxGPh34FxJm+Rm3wG+FxE3Vmb1iYjYPC9zc+AbXaxbEbXPhPUNJ5b+62ZSInlDHv9H4DfAvLqyP0bEEknbSZqdf5kukPQvtRnlw1wXS7pA0pPAUZJ2lHRN/mV6JbB1dwPMv6avl3SapBXASZI2lvQNSQ/nw0JnS9q00uZz+RfxEkkfzb+yd8rTrpb0sbr5X1cZ31nSlXkd50n6QGXa+ZK+LenneZ1ulPSqyvRdK22XSfqCpFdI+mv1sJWkPSUtz3sg9eu7saTTc+xL8vDGkl6d3xeAxyVd1eJl+z4wuTJ+JDCji5f6z8CdwIE5jmHAvsDsFm2+DkyPiFMi4tFIbomIDzSp/2lgJfDhiHgQICIWRcRxEXFHrvO/wFLgREmTgdcAX2o0s4h4HPgZL22ra8jv2dn5fVmZt8cdKtO7er/PyntkTwNvazD/YZK+l9+rxyT9LJcPlXRZfp8fy8OjKu2OkvRAjmmhpMMr0z4q6d7c7opqvAOJE0s/FRHPATeSkgf5+bfAdXVltb2Vi4DFwHbA+4H/lLR/ZZYTgYtJv0AvBH4A3EJKKF9l9S+77ngT8ACwDTANOAV4NekLZSdgJPBlAEnjgc+SfgmPBQ5odyGSNgOuzHFvAxwGnClp10q1w4CvAEOBBTkeJG0B/Br4Jen12QmYExF/Bq4Gql+2HwZmRsTfG4TxRWDvvG6vB/YCvhQR9wO1OLaMiLe3WJULgEmSBkl6LbAF6X3uygxSEgKYBFwCPNuooqSXAfuQ3u92HQD8JCJeaFYh0vWhPgb8P+B04F8i4q9NYtgKOIT0PrRyOGn72xq4nbRttvt+f4j0Hm9B+lzU+z7wMtJ7sw1wWi7fAPgesAOwPfA34H8qyz0DeFfea9s3x4Wkg4Ev5PUaTvo8XtTF+q2fIsKPfvoATgJ+mof/QPoyHl9XNhkYDTwPbFFp+1/A+ZX5XFuZtj2wCtisUvYD4IImcbwVeAF4vPL4AHAU8HClnoCnSYdOamX7AAvz8HnAyZVprwYC2CmPXw18rDL9KOC6PPxB4Ld1cX0HODEPnw+cW5l2EHBfHj4MuK3Jun0QuD4PDyLtHezVpO4fgYMq4wcCD+bhMXldBjdp++J0UpI7EDiZlKwOqM0n130QOKD6GpAOUS0DXg78HtgP+I/Ke1yd/8g8vHM3trX5wMfbqDcYuBd4qH5d8/v3V+CJvPzbge1bzOt8UhKvjW9O2o5Ht/l+z2gx7xGkbXZoG+v0BuCxPLwZaft+H7BpXb3LgaMr4xvk9d2hu5/t/v7wHkv/di3wZklDgeERMR/4HbBvLtst19kOWBERKyttHyJ9wdQsqgxvR/ogPV1Xv5UlEbFl5TGrwXyHk34h3qLUYfw4aS9heGW51fpdLbNqB+BNtfnmeR8OvKJS58+V4b+SvqggfVH9scl8LwF2kfRK0p7UExFxU5O629XF/FAu664ZpIRxGGkPpksR8Tfg56RDT1tHxPUtqj9G+lLtTsf5X9qsPzXXfYS091nvUxHxcmB30p7jqAZ1ql7cHiLiKWAF6TVt5/2ubkv1RpM+E4/VT5D0MknfkfRQPjR8LbClpEH5M/FB4OPA0nxodefcdAfgW5V4VpB+TI2sX8b6zomlf7uB9At1CnA9QEQ8CSzJZUsiYmEeH5YP+dRsD/ypMl69zPVSYGje7a/WXxvV+T5KOqywayUBvTxSZ25tuaNbLPNpUmKqqf8SuaYuuW0eEce0EeMi4FWNJkTEM8As0pfWEaTDJ80sIX25VONf0sby6/0YeDfwQER0J7nOAD5D6xiJdHjqBtKv7nb9GnivpKbfGZJ2AT5HOhx2NPAFSWObxHAnaY/q25LUYrkvbg+SNgeGkV7Tdt7vVpduX0T6TGzZYNpnSP1Db4qIIbx0aFk59isi4h2kRHsfqW+pNs9/rYtp04j4XYs41ktOLP1Y/pU6l9Sx+tvKpOty2bW53iLSnsx/SdpE0u6kD/6FTeb7UJ7vVyRtpHQK6Rpnea1FvC+QPoSnSdoGQNJISQfmKrNIJw7skvsBTqybxe3AIfkX5U55HWouA14t6QhJG+bHP+R+iq5cBrxC0vG5s30L5VNks9oexHtovQdxEfAlScMlbU3qO+r2f3/yr+K3k76gu+Ma0l7Vf7dR9/Ok1/pztZMTJL1e0swm9U8FhgDTax3S+b07VdLuOeF8F/haRNwXqUP/DOCcFoljOqlv4z0t4jxI0pslbUTqa7kxb889eb+JiKWkQ1dn5s76DSXVEsgWpB9AjyudCPHidihpW0nvyT+6ngWeIh2eAzgbOKHWzyPp5ZIObSee9Y0TS/93DenDWe2c/G0uq55mfBjpOPsS4KekY9FXtpjvh0gd7ytIH6yuzkxq17+TOmx/nw8z/Jr065CIuJzU6XtVrlN/9tRpwHOkvoTpVBJjPsz3TlLH9RLSYa9TgI27Cii3fQcpef6Z1J/wtsr060mHjm6NfEZUE/9BSsh3kM7SujWXdVtEzI2IZofnmrWJiJgTESvaqPs7UvJ6O/CA0ll75wC/aFJ/Bamj+u/AjZJWAnNI/SULgONIe5NfqzT7KmmvsmGCjHQCyhnA/28R6g9I298KYE/SnmOP3u+KI/L63Ec6dHd8Lj+d1Gf1KKm/6peVNhuQ9miW5Jj+iXSyAhHx0xzDzLxt30WT07fXd4rwjb5s3SUpgLER0dXZQ70dx1XADyLi3E7GMZAo/blzcUQ0PGXZ1l0D+Q9rZm2R9A/AHqRTss2sCz4UZtaCpOmkw3XH151VZ2ZN+FCYmZkV5T0WMzMrasD1sWy99dYxZsyYTodhZtav3HLLLY9GxPCuaw7AxDJmzBjmzp3b6TDMzPoVSW3/WdeHwszMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKyoXkssks6T9IikuyplwyRdKWl+fh5amXaCpAWS5lXuKIikPSXdmaedUbsbXb7T3w9z+Y2SxvTWutRMmDCBCRN6fCNFM7P1Wm/usZwPjK8rmwrMiYixpLvPTYUX75U9Cdg1tzlT0qDc5izS/dvH5kdtnkcDj0XETqQ7C57Sa2tiZmZt67XEEhHXkm7dWTWRdEtZ8vPBlfKZEfFsRCwk3ep0L0kjgCERcUOk6/vPqGtTm9fFwP4t7q1tZmZ9pK/7WLaNiKUA+XmbXD4SWFSptziXjczD9eWrtYmIVaR7b2/VaKGSpkiaK2nu8uXLC62KmZk1sq503jfa04gW5a3arFkYcU5EjIuIccOHt3XVZzMzW0t9nViW5cNb5OdHcvliYHSl3ihgSS4f1aB8tTaSBgMvZ81Db2Zm1sf6OrHMBibn4cnAJZXySflMrx1JnfQ35cNlKyXtnftPjqxrU5vX+4GrwvdZNjPruF670Zeki4C3AltLWgycCJwMzJJ0NPAwcChARNwtaRZwD7AKODYins+zOoZ0htmmwOX5AfBd4PuSFpD2VCb11rqYmVn7ei2xRMRhTSbt36T+NGBag/K5wG4Nyp8hJyYzM1t3rCud92Zmtp5wYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysqI4kFkn/JuluSXdJukjSJpKGSbpS0vz8PLRS/wRJCyTNk3RgpXxPSXfmaWdIUifWx8zMXtLniUXSSOBTwLiI2A0YBEwCpgJzImIsMCePI2mXPH1XYDxwpqRBeXZnAVOAsfkxvg9XxczMGujUobDBwKaSBgMvA5YAE4Hpefp04OA8PBGYGRHPRsRCYAGwl6QRwJCIuCEiAphRaWNmZh3S54klIv4EfAN4GFgKPBERvwK2jYiluc5SYJvcZCSwqDKLxblsZB6uLzczsw7qxKGwoaS9kB2B7YDNJH24VZMGZdGivNEyp0iaK2nu8uXLuxuymZl1QycOhR0ALIyI5RHxd+AnwL7Asnx4i/z8SK6/GBhdaT+KdOhscR6uL19DRJwTEeMiYtzw4cOLroyZma2uE4nlYWBvSS/LZ3HtD9wLzAYm5zqTgUvy8GxgkqSNJe1I6qS/KR8uWylp7zyfIyttzMysQwb39QIj4kZJFwO3AquA24BzgM2BWZKOJiWfQ3P9uyXNAu7J9Y+NiOfz7I4Bzgc2BS7PDzMz66A+TywAEXEicGJd8bOkvZdG9acB0xqUzwV2Kx6gmZmtNf/z3szMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyuqI4lF0paSLpZ0n6R7Je0jaZikKyXNz89DK/VPkLRA0jxJB1bK95R0Z552hiR1Yn3MzOwlndpj+Rbwy4jYGXg9cC8wFZgTEWOBOXkcSbsAk4BdgfHAmZIG5fmcBUwBxubH+L5cCTMzW1OfJxZJQ4B/BL4LEBHPRcTjwERgeq42HTg4D08EZkbEsxGxEFgA7CVpBDAkIm6IiABmVNqYmVmHdGKP5ZXAcuB7km6TdK6kzYBtI2IpQH7eJtcfCSyqtF+cy0bm4fryNUiaImmupLnLly8vuzZmZraaTiSWwcAewFkR8UbgafJhryYa9ZtEi/I1CyPOiYhxETFu+PDh3Y3XzMy6oROJZTGwOCJuzOMXkxLNsnx4i/z8SKX+6Er7UcCSXD6qQbmZmXVQnyeWiPgzsEjSa3LR/sA9wGxgci6bDFySh2cDkyRtLGlHUif9Tflw2UpJe+ezwY6stDEzsw4Z3KHlfhK4UNJGwAPAR0hJbpako4GHgUMBIuJuSbNIyWcVcGxEPJ/ncwxwPrApcHl+mJlZB3UksUTE7cC4BpP2b1J/GjCtQflcYLeiwZmZWY+0lVjyIahPAmOqbSLiPb0TlpmZ9Vft7rH8jPS/k0uBF3otGjMz6/faTSzPRMQZvRqJmZmtF9pNLN+SdCLwK+DZWmFE3NorUZmZWb/VbmJ5HXAE8HZeOhQWedzMzOxF7SaW9wKvjIjnejMYMzPr/9r9g+QfgC17MQ4zM1tPtLvHsi1wn6SbWb2Pxacbm5nZatpNLCf2ahRmZrbeaCuxRMQ1vR2ImZmtH9r95/1KXrok/UbAhsDTETGktwIzM7P+qd09li2q45IOBvbqjYDMzKx/W6vL5kfEz/B/WMzMrIF2D4UdUhndgHRl4oZ3azQzs4Gt3bPCJlSGVwEPAhOLR2NmZv1eu30sH+ntQMzMbP3QMrFI+nKLyRERXy0cj5mZ9XNd7bE83aBsM+BoYCvAicXMzFbTMrFExDdrw5K2AI4j3Z9+JvDNZu3MzGzg6rKPRdIw4NPA4cB0YI+IeKy3AzMzs/6pqz6WrwOHAOcAr4uIp/okKjMz67e6+oPkZ4DtgC8BSyQ9mR8rJT3Z++GZmVl/01Ufy1r9M9/MzAYuJw4zMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMrqmOJRdIgSbdJuiyPD5N0paT5+Xlope4JkhZImifpwEr5npLuzNPOkKROrIuZmb2kk3ssxwH3VsanAnMiYiwwJ48jaRdgErArMB44U9Kg3OYsYAowNj/G903oZmbWTEcSi6RRwLuBcyvFE0mX5Sc/H1wpnxkRz0bEQmABsJekEcCQiLghIgKYUWljZmYd0qk9ltOBzwMvVMq2jYilAPl5m1w+ElhUqbc4l43Mw/Xla5A0RdJcSXOXL19eZAXMzKyxPk8skv4ZeCQibmm3SYOyaFG+ZmHEORExLiLGDR8+vM3FmpnZ2ujyDpK9YD/gPZIOAjYBhki6AFgmaURELM2HuR7J9RcDoyvtRwFLcvmoBuVmZtZBfb7HEhEnRMSoiBhD6pS/KiI+DMwGJudqk4FL8vBsYJKkjSXtSOqkvykfLlspae98NtiRlTZmZtYhndhjaeZkYJako4GHgUMBIuJuSbOAe4BVwLER8XxucwxwPrApcHl+mJlZB3U0sUTE1cDVefgvwP5N6k0DpjUonwvs1nsRmplZd/mf92ZmVpQTi5mZFeXEYmZmRTmxmJlZUU4sZmZWlBOLmZkV5cRiZmZFObGYmVlRTixmZlaUE4uZmRXlxGJmZkU5sZiZWVFOLGZmVpQTi5mZFeXEYmZmRTmxmJlZUU4sZmZWlBOLmZkV5cRiZmZFObGYmVlRTixmZlaUE4uZmRXlxGJmZkU5sZiZWVFOLGZmVpQTi5mZFeXEYmZmRTmxmJlZUU4sZmZWlBOLmZkV5cRiZmZFObGYmVlRfZ5YJI2W9BtJ90q6W9JxuXyYpCslzc/PQyttTpC0QNI8SQdWyveUdGeedoYk9fX6mJnZ6jqxx7IK+ExEvBbYGzhW0i7AVGBORIwF5uRx8rRJwK7AeOBMSYPyvM4CpgBj82N8X66ImZmtqc8TS0QsjYhb8/BK4F5gJDARmJ6rTQcOzsMTgZkR8WxELAQWAHtJGgEMiYgbIiKAGZU2ZmbWIR3tY5E0BngjcCOwbUQshZR8gG1ytZHAokqzxblsZB6uL2+0nCmS5kqau3z58qLrYGZmq+tYYpG0OfBj4PiIeLJV1QZl0aJ8zcKIcyJiXESMGz58ePeDNTOztnUksUjakJRULoyIn+TiZfnwFvn5kVy+GBhdaT4KWJLLRzUoNzOzDurEWWECvgvcGxGnVibNBibn4cnAJZXySZI2lrQjqZP+pny4bKWkvfM8j6y0MTOzDhncgWXuBxwB3Cnp9lz2BeBkYJako4GHgUMBIuJuSbOAe0hnlB0bEc/ndscA5wObApfnh5mZdVCfJ5aIuI7G/SMA+zdpMw2Y1qB8LrBbuejMzKyn/M97MzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiMTOzopxYzMysKCcWMzMryonFzMyKcmIxM7OinFjMzKwoJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJxczMinJiWQsTJkzodAhmZussJxYzMyvKicXMzIpyYjEzs6KcWMzMrCgnFjMzK8qJZS1NmDDBZ4eZmTXgxGJmZkU5sfSQ91rMzFbnxGJmZkUN7nQAPSVpPPAtYBBwbkSc3NcxVPdaLr300r5evJnZOqVfJxZJg4BvA+8AFgM3S5odEfd0KqZGh8ZqyWbChAlOPGa23uvXiQXYC1gQEQ8ASJoJTAQ6llgaqSabnvTJVBNUd9vV2jRKbO0mvFbzMDOr6e+JZSSwqDK+GHhTfSVJU4ApefQpSfPWcnlbA4+uZdsek9RsUsu4qu2azaPFvHtUlw6/Zi04ru5bV2NzXN23NrHt0G7F/p5YGn3DxRoFEecA5/R4YdLciBjX0/mUtq7GBetubI6r+9bV2BxX9/V2bP39rLDFwOjK+ChgSYdiMTMz+n9iuRkYK2lHSRsBk4DZHY7JzGxA69eHwiJilaRPAFeQTjc+LyLu7sVF9vhwWi9ZV+OCdTc2x9V962psjqv7ejU2RazRJWFmZrbW+vuhMDMzW8c4sZiZWVEDKrFIGi9pnqQFkqY2mC5JZ+Tpd0jao6u2koZJulLS/Pw8tDLthFx/nqQDOxDb1yXdl+v/VNKWuXyMpL9Juj0/zu7juE6S9KfK8g/q7mvWS3H9sBLTg5Ju7+PX6zxJj0i6q67NurCNNYut09tYs7g6vY01i6vH21hPYpM0WtJvJN0r6W5Jx1XaFNnOAIiIAfEgde7/EXglsBHwB2CXujoHAZeT/h+zN3BjV22BrwFT8/BU4JQ8vEuutzGwY24/qI9jeycwOA+fUoltDHBXB1+zk4DPNlheW69Zb8VV1/6bwJf76vXK0/4R2KN+WZ3exrqIrWPbWBdxdWwbaxVXT7exAtv/CGCPPLwFcD8Fv8tqj4G0x/Li5V8i4jmgdvmXqonAjEh+D2wpaUQXbScC0/PwdODgSvnMiHg2IhYCC/J8+iy2iPhVRKzK7X9P+p9Pd/TWa9ZMu69Zr8YlScAHgIu6iLdkXETEtcCKBvPt9DbWNLYOb2OtXrNm+mIb6zKuHmxjPYotIpZGxK05xpXAvaQrmNTa9HQ7AwbWobBGl38Z2WadVm23jYilAPl5m24sr7djq/oo6RdMzY6SbpN0jaS3dCCuT+Rd9PMqu9ztrktvv15vAZZFxPxKWW+/Xq10ehtrV19vY13p1DbWjrXdxorFJmkM8EbgxlxUYjsDBlZiaefyL83qtHXpmLVYXnfqrnVskr4IrAIuzEVLge0j4o3Ap4EfSBrSh3GdBbwKeEOO5ZvdWF5vxlVzGKv/kuyL12tt9NU21nUgndnGWunkNtaOtd3GisQmaXPgx8DxEfFkF7F2ez0HUmJp5/Ivzeq0arustvubnx/pxvJ6OzYkTQb+GTg88gHTvEv7lzx8C+mY6av7Kq6IWBYRz0fEC8D/8tJudbuvWW++XoOBQ4Af1sr66PVqpdPbWEsd3Maa6vA21lIPt7EexyZpQ1JSuTAiflKpU2I7e3GFBsSDdJWBB0idT7UOr13r6ryb1Tu8buqqLfB1Vu/w+loe3pXVO7weoHnHam/FNp50C4HhdfMaXouF1AH4J2BYH8Y1otL+30jHb9t+zXorrsprdk1fv16V6WNYsyO6o9tYF7F1bBvrIq6ObWOt4urpNlZg+xcwAzi9wXx7vJ29OK9WE9e3B+lMiftJvwa+mMs+Dny88qJ/O0+/ExjXqm0u3wqYA8zPz8Mq076Y688D3tWB2BaQjo3enh9n5/L3AXfnjeVWYEIfx/X9XPcO0rXdRnT3NeuNuPK082vzqJT11et1EemQyN9JvxKPXoe2sWaxdXobaxZXp7exhnGV2MZ6EhvwZtJhrDsq79lBJbeziPAlXczMrKyB1MdiZmZ9wInFzMyKcmIxM7OinFjMzKwoJxYzMyvKicUGDEmnSTq+Mn6FpHMr49+U9Om1nPdbJV3WpPyJylVrf71WwZv1I04sNpD8DtgXQNIGwNakP3/V7Atc386MJA3qxnJ/GxFvyI8D6ubTr28PbtaIE4sNJNeTEwspodwFrJQ0VNLGwGuB2yTtny8GeGe+gOHGAPn+GV+WdB1waL4nxn15/JB2g5B0lKQfSboU+JWkzfJybs7LnZjrbSppZr6Q4g8l3ShpXJ72VGV+75d0fh4eLunHeV43S9ovl5+Ul3G1pAckfarS/si8jD9I+r6kLSQtzJf+QNKQvO4brt3LbgONfy3ZgBERSyStkrQ9KcHcQLpK6z7AE6R/I29A+mf0/hFxv6QZwDHA6Xk2z0TEmyVtQvqH8ttJ/z7/Ic29RfmGTsCPSJfr2AfYPSJWSPpP4KqI+KjSjbJuyofM/hX4a0TsLml30j+yu/It4LSIuC6v5xWkhAmwM/A20n045kk6i3Q9qi8C+0XEo5KGRcRKSVeTLgvyM2AS8OOI+HsbyzfzHosNOLW9llpiuaEy/jvgNcDCiLg/159OumlTTS2B7JzrzY90+YoLWiyzeihsWi67MiJq9+t4JzA1J5+rgU2A7fNyLwCIiDtIia8rBwD/k+c1GxgiaYs87eeRLnb4KOkCg9uSEuPFuYxKTOcCH8nDHwG+18ayzQDvsdjAU+tneR3pUNgi4DPAk8B5NL5EeNXTleGeXA+pOh8B74uIedUKkloto1q+SWV4A2CfiPhbg3k9Wyl6nvT5V6NlRMT1SrfL/SfSBQfvqq9j1oz3WGyguZ50ifcVkS6rvgLYknRo6gbgPmCMpJ1y/SOAaxrM5z7SjZlelccP60FMVwCfVP72l/TGXH4tcHgu2w3YvdJmmaTX5pMQ3lsp/xXwidqIpDd0sew5wAckbZXrD6tMm0G6mKL3VqxbnFhsoLmTdDbY7+vKnoiIRyPiGdKhnx9JuhN4ATi7fia53hTg57nz/qEexPRVYEPgDkl35XFIN6vaXNIdwOeBmyptpgKXAVeRrqJb8ylgXO6Mv4d0xdumIuJuYBpwjaQ/AKdWJl8IDGXtbp9rA5ivbmzWT+QO9c9GxNw+Wt77gYkRcURfLM/WH+5jMbM1SPpv4F2k+36YdYv3WMzMrCj3sZiZWVFOLGZmVpQTi5mZFeXEYmZmRTmxmJlZUf8Hhb9M4k4ACqoAAAAASUVORK5CYII=",
      "text/plain": [
       "<Figure size 432x288 with 1 Axes>"
      ]
     },
     "metadata": {
      "needs_background": "light"
     },
     "output_type": "display_data"
    }
   ],
   "source": [
    "# 作为对比, 分析一下 chaoyi 学长支持的 MIMIC CXR 的词频\n",
    "mimic_path = '/remote-home/weixionglin/vlp/Match/Preprocess/Frequency/csv_cache/word_count_per_case_percentage.json'\n",
    "with open(mimic_path, 'r') as f:\n",
    "    mimic = json.load(f)\n",
    "\n",
    "mimic_frequency = list(mimic.values())\n",
    "plt.hist([fre for fre in mimic_frequency if fre < 1.0], bins=200, color='black', alpha=0.7)\n",
    "plt.xlabel('Word Frequency')\n",
    "plt.ylabel('Num')\n",
    "plt.title('Word Frequency of MIMIC CXR')\n",
    "plt.show()\n",
    "\n",
    "\n",
    "mimic_frequency = list(mimic.values())\n",
    "plt.hist([fre for fre in mimic_frequency if fre < 0.02], bins=200, color='black', alpha=0.7)\n",
    "plt.xlabel('Word Frequency')\n",
    "plt.ylabel('Num')\n",
    "plt.title('Word Frequency of MIMIC CXR per case')\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 69,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "0.0\n",
      "0.36695742935957654\n"
     ]
    }
   ],
   "source": [
    "thresh = 1e-5\n",
    "\n",
    "print(len(df_train_fre.loc[df_train_fre['frequency'] < thresh]) / len(df_train_fre))\n",
    "print(len([fre for fre in mimic_frequency if fre < thresh]) / len(mimic_frequency))"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "## 随机采样分析 low-frequency words 的组成"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 50,
   "metadata": {},
   "outputs": [],
   "source": [
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "#count=one: 5949, 占比: 0.5582246410809797\n"
     ]
    }
   ],
   "source": [
    "# 从 df 中抽取 count=1 的部分\n",
    "wordcount_one = df.loc[df['count'] == 1, 'word'].tolist()\n",
    "\n",
    "num_count1 = len(wordcount_one)\n",
    "print(f\"#count=one: {num_count1}, 占比: {num_count1 / len(df)}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "['indictative',\n",
       " 'phi',\n",
       " 'c5c7',\n",
       " 'tomographyhigh',\n",
       " 'isthmocele',\n",
       " 'tev',\n",
       " 'nopreviou',\n",
       " 'incompetent',\n",
       " 'nonseptate',\n",
       " 'gastrosplenic',\n",
       " 'femoralcanal',\n",
       " 'livermas',\n",
       " 'knot',\n",
       " 'reductionin',\n",
       " 'tm',\n",
       " 'entrapment',\n",
       " 'pdwse',\n",
       " 'enhancedmriangiography',\n",
       " 'gtube',\n",
       " 'erector',\n",
       " 'xj',\n",
       " 'lowintermediate',\n",
       " 'montant',\n",
       " 'road',\n",
       " 'hepatocyte',\n",
       " 'proportion',\n",
       " 'rich',\n",
       " 'practitioner',\n",
       " 'hyperkinetic',\n",
       " 'fibroin',\n",
       " 'vaginale',\n",
       " 'similarhigh',\n",
       " 'broach',\n",
       " 'supraclenoid',\n",
       " 'myofibroblastic',\n",
       " 'sleep',\n",
       " 'unlabeled',\n",
       " 'inoculation',\n",
       " 'arrows90',\n",
       " 'lvh',\n",
       " 'subcentimetrical',\n",
       " 'ecrb',\n",
       " 'imagem',\n",
       " 'imagingi',\n",
       " 'okay',\n",
       " 'sponge',\n",
       " 'locclusion',\n",
       " 'ehl',\n",
       " 'thromboembolic',\n",
       " 'realce',\n",
       " 'retroclavicular',\n",
       " 'eventual',\n",
       " 'lco',\n",
       " 'tvalue3',\n",
       " 'intraduralextramedullary',\n",
       " 'igg4kd',\n",
       " 'communicante',\n",
       " 'subperiosteal',\n",
       " 'orthos',\n",
       " 'insallsalvati',\n",
       " 'lactophenol',\n",
       " 'thispathology',\n",
       " 'tbw',\n",
       " 'bourgeonnement',\n",
       " 'vvi',\n",
       " 'tubulocystic',\n",
       " 'gastrohepatic',\n",
       " 'neotympanic',\n",
       " 'esp',\n",
       " 'footling',\n",
       " 'cccholedochal',\n",
       " 'lightened',\n",
       " 'instruction',\n",
       " 'suprachoroidal',\n",
       " 'gleevec',\n",
       " 'fistulagram',\n",
       " 'typico',\n",
       " 'ki67',\n",
       " 'interested',\n",
       " 'forschungsinstitut',\n",
       " 'shielding',\n",
       " 'rolandic',\n",
       " 'chimpanzee',\n",
       " 'cea',\n",
       " 'thethird',\n",
       " 'stape',\n",
       " 'vvecmo',\n",
       " 'antibiotherapy',\n",
       " 'compatibility',\n",
       " 'garden',\n",
       " 'schwanome',\n",
       " 'minilaparotomy',\n",
       " 'bacterium',\n",
       " 'decompensation',\n",
       " 'klein',\n",
       " 'pulpotomised',\n",
       " 'underfascia',\n",
       " 'philadelphia',\n",
       " 'reviewer',\n",
       " 'livernote',\n",
       " 'anonyma',\n",
       " 'anastamosi',\n",
       " 'liverpattern',\n",
       " 'coma',\n",
       " 'pointof',\n",
       " 'mettant',\n",
       " 'xylosoxidan',\n",
       " 'arrowchamber',\n",
       " 'case3',\n",
       " 'cholangiopancreaticogram',\n",
       " 'aw',\n",
       " 'bpco',\n",
       " 'hyperluscent',\n",
       " 'mancontrastenhanced',\n",
       " 'mobilization',\n",
       " 'multigen',\n",
       " 'thecerebellum',\n",
       " 'istanbul',\n",
       " 'consistant',\n",
       " 'biopsied',\n",
       " 'cortriatriatum',\n",
       " 'nonvascular',\n",
       " 'scopy',\n",
       " 'mat',\n",
       " 'endarterectomy',\n",
       " 'eosin',\n",
       " 'manned',\n",
       " 'mentioning',\n",
       " 'centrum',\n",
       " 'bei',\n",
       " 'transgluteal',\n",
       " 'greywhite',\n",
       " 'labiomental',\n",
       " 'retropubic',\n",
       " 'gct',\n",
       " 'arteriosum',\n",
       " 'intraarterial',\n",
       " 'ladlater',\n",
       " 'macrocephaly',\n",
       " 'pericolic',\n",
       " 'endsystole',\n",
       " 'subjacent',\n",
       " 'aaorta',\n",
       " 'invade',\n",
       " 'reconstructive',\n",
       " 'mediaadventitia',\n",
       " 'padua',\n",
       " 'jshaped',\n",
       " 'bound',\n",
       " 'relevant',\n",
       " 'fovealcenter',\n",
       " 'earthquake',\n",
       " 'thympani',\n",
       " 'hypopituitarism',\n",
       " 'bowell',\n",
       " 'mcd',\n",
       " 'admissionreconstituted',\n",
       " 'offending',\n",
       " 'epn',\n",
       " 'pillar',\n",
       " 'contrastenhancement',\n",
       " 'older',\n",
       " 'tor',\n",
       " 'switzerland',\n",
       " 'sellasuprasellar',\n",
       " 'interluminal',\n",
       " 'septummeasurement',\n",
       " 'occipitotemporal',\n",
       " 'misdirectionciliaryblockmalignant',\n",
       " 'sydney',\n",
       " 'videothoracoscopic',\n",
       " 'hemiplegia',\n",
       " 'a13yearold',\n",
       " 'arcade',\n",
       " 'microvessel',\n",
       " 'td',\n",
       " 'hitch',\n",
       " 'ti',\n",
       " 'cabea',\n",
       " 'asparaginase',\n",
       " 't1image',\n",
       " 'ptgbd',\n",
       " 'apcanaldiameter',\n",
       " 'prdida',\n",
       " 'nonossified',\n",
       " 'aliasing',\n",
       " 'dsci',\n",
       " 'dissecando',\n",
       " 'tgtv',\n",
       " 'omc',\n",
       " 'onecolor',\n",
       " 'tru',\n",
       " 'n32',\n",
       " 'gonial',\n",
       " 'crowding',\n",
       " 'voxellevel',\n",
       " 'california',\n",
       " 'm46year',\n",
       " 'nonpure',\n",
       " 'striatocapsular']"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "random.sample(wordcount_one, k=200)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.5"
  },
  "orig_nbformat": 4
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
